280 files changed, 6912 insertions, 2764 deletions
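The build-system hunks that open this diff add configure-time checks for posix_spawn and pread, surfacing HAVE_POSIX_SPAWN and HAVE_PREAD in the generated config headers. As a hedged sketch of how such a guard is typically consumed (the helper name ReadAtOffset is illustrative, not from this patch):

#include "llvm/Config/config.h"
#include <unistd.h>

// Read Size bytes at an absolute offset. With pread the call is atomic and
// leaves the file offset untouched; the fallback seeks first, then reads.
static ssize_t ReadAtOffset(int FD, void *Buf, size_t Size, off_t Off) {
#ifdef HAVE_PREAD
  return ::pread(FD, Buf, Size, Off);
#else
  if (::lseek(FD, Off, SEEK_SET) == -1)
    return -1;
  return ::read(FD, Buf, Size);
#endif
}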
diff --git a/autoconf/configure.ac b/autoconf/configure.ac index 090f5dc..d777553 100644 --- a/autoconf/configure.ac +++ b/autoconf/configure.ac @@ -1293,7 +1293,7 @@ AC_CHECK_FUNCS([backtrace ceilf floorf roundf rintf nearbyintf getcwd ]) AC_CHECK_FUNCS([powf fmodf strtof round ]) AC_CHECK_FUNCS([getpagesize getrusage getrlimit setrlimit gettimeofday ]) AC_CHECK_FUNCS([isatty mkdtemp mkstemp ]) -AC_CHECK_FUNCS([mktemp posix_spawn realpath sbrk setrlimit strdup ]) +AC_CHECK_FUNCS([mktemp posix_spawn pread realpath sbrk setrlimit strdup ]) AC_CHECK_FUNCS([strerror strerror_r setenv ]) AC_CHECK_FUNCS([strtoll strtoq sysconf malloc_zone_statistics ]) AC_CHECK_FUNCS([setjmp longjmp sigsetjmp siglongjmp writev]) diff --git a/cmake/config-ix.cmake b/cmake/config-ix.cmake index 62699fa..0943ae7 100755 --- a/cmake/config-ix.cmake +++ b/cmake/config-ix.cmake @@ -126,6 +126,8 @@ check_symbol_exists(readdir "sys/types.h;dirent.h" HAVE_READDIR) check_symbol_exists(getcwd unistd.h HAVE_GETCWD) check_symbol_exists(gettimeofday sys/time.h HAVE_GETTIMEOFDAY) check_symbol_exists(getrlimit "sys/types.h;sys/time.h;sys/resource.h" HAVE_GETRLIMIT) +check_symbol_exists(posix_spawn spawn.h HAVE_POSIX_SPAWN) +check_symbol_exists(pread unistd.h HAVE_PREAD) check_symbol_exists(rindex strings.h HAVE_RINDEX) check_symbol_exists(strchr string.h HAVE_STRCHR) check_symbol_exists(strcmp string.h HAVE_STRCMP) @@ -17151,7 +17151,8 @@ done -for ac_func in mktemp posix_spawn realpath sbrk setrlimit strdup + +for ac_func in mktemp posix_spawn pread realpath sbrk setrlimit strdup do as_ac_var=`echo "ac_cv_func_$ac_func" | $as_tr_sh` { echo "$as_me:$LINENO: checking for $ac_func" >&5 diff --git a/docs/CMake.html b/docs/CMake.html index 087d693..c822ed7 100644 --- a/docs/CMake.html +++ b/docs/CMake.html @@ -145,7 +145,7 @@ text. Generator's names are case-sensitive. Example:</p> <div class="doc_code"> - <p><tt>cmake -G "Visual Studio 8 2005" path/to/llvm/source/root</tt></p> + <p><tt>cmake -G "Visual Studio 9 2008" path/to/llvm/source/root</tt></p> </div> <p>For a given development platform there can be more than one diff --git a/docs/GettingStarted.html b/docs/GettingStarted.html index 0a6a481..d78344a 100644 --- a/docs/GettingStarted.html +++ b/docs/GettingStarted.html @@ -283,7 +283,7 @@ software you will need.</p> <tr> <td>Windows</td> <td>x86<sup><a href="#pf_1">1</a></sup></td> - <td>Visual Studio 2005 SP1 or higher<sup><a href="#pf_4">4</a>,<a href="#pf_5">5</a></sup></td> + <td>Visual Studio 2008 or higher<sup><a href="#pf_4">4</a>,<a href="#pf_5">5</a></sup></td> <tr> <td>AIX<sup><a href="#pf_3">3</a>,<a href="#pf_4">4</a></sup></td> <td>PowerPC</td> diff --git a/docs/GettingStartedVS.html b/docs/GettingStartedVS.html index d6bf1b6..8c86ec6 100644 --- a/docs/GettingStartedVS.html +++ b/docs/GettingStartedVS.html @@ -88,8 +88,8 @@ <div> - <p>Any system that can adequately run Visual Studio .NET 2005 SP1 is fine. - The LLVM source tree and object files, libraries and executables will consume + <p>Any system that can adequately run Visual Studio 2008 is fine. The LLVM + source tree and object files, libraries and executables will consume approximately 3GB.</p> </div> @@ -98,10 +98,9 @@ <h3><a name="software"><b>Software</b></a></h3> <div> - <p>You will need Visual Studio .NET 2005 SP1 or higher. The VS2005 SP1 - beta and the normal VS2005 still have bugs that are not completely - compatible. 
Earlier versions of Visual Studio do not support the C++ standard - well enough and will not work.</p> + <p>You will need Visual Studio 2008 or higher. Earlier versions of Visual + Studio have bugs, are not completely compatible, or do not support the C++ + standard well enough.</p> <p>You will also need the <a href="http://www.cmake.org/">CMake</a> build system since it generates the project files you will use to build with.</p> diff --git a/docs/ObjCPropertyDebugInfo.html b/docs/ObjCPropertyDebugInfo.html deleted file mode 100644 index c1482cf..0000000 --- a/docs/ObjCPropertyDebugInfo.html +++ /dev/null @@ -1,237 +0,0 @@ -<!DOCTYPE HTML PUBLIC "-//W3C//DTD HTML 4.01//EN" - "http://www.w3.org/TR/html4/strict.dtd"> -<html> -<head> - <meta http-equiv="Content-Type" content="text/html; charset=utf-8"> - <title>Debugging Information Extension for Objective C Properties</title> - <link rel="stylesheet" href="llvm.css" type="text/css"> -</head> -<body> - -<h1> - Debugging Information Extension for Objective C Properties -</h1> - -<ol> - <li><a href="#introduction">Introduction</a></li> - <li><a href="#proposal">Proposal</a></li> - <li><a href="#newattributes">New DWARF Attributes</a></li> - <li><a href="#newconstants">New DWARF Constants</a></li> - -</ol> - -<div class="doc_author"> - <p>Written by Jim Ingham and Devang Patel </p> -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="introduction">Introduction</a> -</h2> -<!-- *********************************************************************** --> - -<div> -<p>Objective C provides a simpler way to declare and define accessor methods -using declared properties. The language provides features to declare a -property and to let compiler synthesize accessor methods. -</p> - -<p>The debugger lets developer inspect Objective C interfaces and their -instance variables and class variables. However, the debugger does not know -anything about the properties defined in Objective C interfaces. The debugger -consumes information generated by compiler in DWARF format. The format does -not support encoding of Objective C properties. This proposal describes DWARF -extensions to encode Objective C properties, which the debugger can use to let -developers inspect Objective C properties. -</p> - -</div> - - -<!-- *********************************************************************** --> -<h2> - <a name="proposal">Proposal</a> -</h2> -<!-- *********************************************************************** --> - -<div> -<p>Objective C properties are always backed by an instance variable. The -instance variables backing properties are identified using -DW_AT_APPLE_property_name attribute. The instance variables with this -attribute may not have data location attributes. The location of instance -variables is determined by debugger only after consulting Objective C runtime. 
-</p> - -<div class="doc_code"> -<pre> -@interface I1 { - int n2; -} - -@property p1; -@property p2; -@end - -@implementation I1 -@synthesize p1; -@synthesize p2 = n2; -@end - - -TAG_structure_type [7] * - AT_APPLE_runtime_class( 0x10 ) - AT_name( "I1" ) - AT_decl_file( "Objc_Property.m" ) - AT_decl_line( 3 ) - - TAG_member [8] - AT_name( "p1" ) - AT_APPLE_property_name(“p1”) - AT_type( {0x00000147} ( int ) ) - - TAG_member [8] - AT_name( "n2" ) - AT_APPLE_property_name(“p2”) - AT_type( {0x00000147} ( int ) ) -</pre> -</div> - -<p> Developers can decorate a property with attributes which are encoded using -DW_AT_APPLE_property_attribute. -</p> - -<div class="doc_code"> -<pre> -@property (readonly, nonatomic) int pr; - - -TAG_member [8] - AT_name(“pr”) - AT_APPLE_property_name(“pr”) - AT_type ( {0x00000147} (int) ) - AT_APPLE_property_attribute (DW_APPLE_PROPERTY_readonly, DW_APPLE_PROPERTY_nonatomic) -</pre> -</div> - -<p> The setter and getter method names are attached to the property using -DW_AT_APPLE_property_setter and DW_AT_APPLE_property_getter attributes. -</p> -<div class="doc_code"> -<pre> -@interface I1 -@property (setter=myOwnP3Setter:) int p3; --(void)myOwnP3Setter:(int)a; -@end - -@implementation I1 -@synthesize p3; --(void)myOwnP3Setter:(int)a{ } -@end - -0x000003bd: TAG_structure_type [7] * - AT_APPLE_runtime_class( 0x10 ) - AT_name( "I1" ) - AT_decl_file( "Objc_Property.m" ) - AT_decl_line( 3 ) -0x000003f3: TAG_member [8] - AT_name( "p3" ) - AT_APPLE_property_name(“p3”) - AT_APPLE_property_setter(“myOwnP3Setter:”) - AT_type( {0x00000147} ( int ) ) -</pre> -</div> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="newattributes">New DWARF Attributes</a> -</h2> -<!-- *********************************************************************** --> - -<div> -<table border="1" cellspacing="0"> - <tr> - <th width=200 >Attribute</th> - <th width=200 >Value</th> - <th width=200 >Classes</th> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_property_name</td> - <td width=200 >0x3fe8</td> - <td width=200 >String</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_property_getter</td> - <td width=200 >0x3fe9</td> - <td width=200 >String</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_property_setter</td> - <td width=200 >0x3fea</td> - <td width=200 >String</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_property_attribute</td> - <td width=200 >0x3feb</td> - <td width=200 >Constant</td> - </tr> -</table> - -</div> - -<!-- *********************************************************************** --> -<h2> - <a name="newconstants">New DWARF Constants</a> -</h2> -<!-- *********************************************************************** --> - -<div> -<table border="1" cellspacing="0"> - <tr> - <th width=200 >Name</th> - <th width=200 >Value</th> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_PROPERTY_readonly</td> - <td width=200 >0x1</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_PROPERTY_readwrite</td> - <td width=200 >0x2</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_PROPERTY_assign</td> - <td width=200 >0x4</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_PROPERTY_retain</td> - <td width=200 >0x8</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_PROPERTY_copy</td> - <td width=200 >0x10</td> - </tr> - <tr> - <td width=200 >DW_AT_APPLE_PROPERTY_nonatomic</td> - <td width=200 >0x20</td> - </tr> -</table> - -</div> - -<!-- *********************************************************************** --> - -<hr> -<address> - <a 
href="http://jigsaw.w3.org/css-validator/check/referer"><img - src="http://jigsaw.w3.org/css-validator/images/vcss-blue" alt="Valid CSS"></a> - <a href="http://validator.w3.org/check/referer"><img - src="http://www.w3.org/Icons/valid-html401-blue" alt="Valid HTML 4.01"></a> - - <a href="http://llvm.org/">LLVM Compiler Infrastructure</a><br> - Last modified: $Date: 2011-11-14 $ -</address> - -</body> -</html> diff --git a/docs/ProgrammersManual.html b/docs/ProgrammersManual.html index 2ab1681..5186203 100644 --- a/docs/ProgrammersManual.html +++ b/docs/ProgrammersManual.html @@ -85,7 +85,8 @@ option</a></li> <li><a href="#dss_set"><set></a></li> <li><a href="#dss_setvector">"llvm/ADT/SetVector.h"</a></li> <li><a href="#dss_uniquevector">"llvm/ADT/UniqueVector.h"</a></li> - <li><a href="#dss_otherset">Other Set-Like ContainerOptions</a></li> + <li><a href="#dss_immutableset">"llvm/ADT/ImmutableSet.h"</a></li> + <li><a href="#dss_otherset">Other Set-Like Container Options</a></li> </ul></li> <li><a href="#ds_map">Map-Like Containers (std::map, DenseMap, etc)</a> <ul> @@ -97,6 +98,7 @@ option</a></li> <li><a href="#dss_intervalmap">"llvm/ADT/IntervalMap.h"</a></li> <li><a href="#dss_map"><map></a></li> <li><a href="#dss_inteqclasses">"llvm/ADT/IntEqClasses.h"</a></li> + <li><a href="#dss_immutablemap">"llvm/ADT/ImmutableMap.h"</a></li> <li><a href="#dss_othermap">Other Map-Like Container Options</a></li> </ul></li> <li><a href="#ds_bit">BitVector-like containers</a> @@ -1608,6 +1610,29 @@ factors, and produces a lot of malloc traffic. It should be avoided.</p> </div> +<!-- _______________________________________________________________________ --> +<h4> + <a name="dss_immutableset">"llvm/ADT/ImmutableSet.h"</a> +</h4> + +<div> + +<p> +ImmutableSet is an immutable (functional) set implementation based on an AVL +tree. +Adding or removing elements is done through a Factory object and results in the +creation of a new ImmutableSet object. +If an ImmutableSet already exists with the given contents, then the existing one +is returned; equality is compared with a FoldingSetNodeID. +The time and space complexity of add or remove operations is logarithmic in the +size of the original set. + +<p> +There is no method for returning an element of the set, you can only check for +membership. + +</div> + <!-- _______________________________________________________________________ --> <h4> @@ -1814,6 +1839,25 @@ it can be edited again.</p> <!-- _______________________________________________________________________ --> <h4> + <a name="dss_immutablemap">"llvm/ADT/ImmutableMap.h"</a> +</h4> + +<div> + +<p> +ImmutableMap is an immutable (functional) map implementation based on an AVL +tree. +Adding or removing elements is done through a Factory object and results in the +creation of a new ImmutableMap object. +If an ImmutableMap already exists with the given key set, then the existing one +is returned; equality is compared with a FoldingSetNodeID. +The time and space complexity of add or remove operations is logarithmic in the +size of the original map. + +</div> + +<!-- _______________________________________________________________________ --> +<h4> <a name="dss_othermap">Other Map-Like Container Options</a> </h4> diff --git a/docs/ReleaseNotes.html b/docs/ReleaseNotes.html index 8c803cd..f2d8930 100644 --- a/docs/ReleaseNotes.html +++ b/docs/ReleaseNotes.html @@ -712,6 +712,34 @@ be used to verify some algorithms. 
<p>LLVM 3.0 includes several major new capabilities:</p> +<!-- Near dead: + Analysis/RegionInfo.h + Dom Frontiers + SparseBitVector: used in LiveVar. + + --> + +<!-- + Type system rewrite. + Better performance for Neon code in clang due to SRoA improvements. + New regalloc on by default. Lin scan going away in 3.1 + PGO / builtin_expect improvements (summary needed) + Big EH rewrite. + AVX support, assembler, compiler and disassembler. + IndVar improvements: andy + PTX backend improvements: Justin + llvm-rtdyld & MC JIT: JimG + InstAliases now automatically used in the asmprinter where they are shorter. + Integrated assembler on by default for arm/thumb? + PostOrder Dominator frontiers were removed. + Line Profiling / gcov support + EH and debug information produced with CFI directives, yielding smaller executables: http://blog.mozilla.com/respindola/2011/05/12/cfi-directives/ + X86-64 generates smaller and faster code at -O0 (fast isel improvements) + Better code generation for Cortex-A9 + Many APIs take ArrayRef's now. + Pass manager extension API. + + --> + <ul> <!-- @@ -902,11 +930,17 @@ Builder.CreateResume(UnwindData); in.</p> <ul> -<!-- -<li></li> ---> + <li>The ELF object streamers are much more full featured.</li> + <li>Target dependent relocation handling has been refactored into the Targets.</li> + <li>Early stage MC-JIT infrastructure has been implemented.</li> </ul> +<p>The MC-JIT is a major new feature for MC, and will eventually grow to replace +the current JIT implementation. It emits object files direct to memory and +uses a runtime dynamic linker to resolve references and drive lazy compilation. +The MC-JIT enables much greater code reuse between the JIT and the static +compiler and provides better integration with the platform ABI as a result.</p> + <p>For more information, please see the <a href="http://blog.llvm.org/2010/04/intro-to-llvm-mc-project.html">Intro to the LLVM MC Project Blog Post</a>.</p> @@ -942,6 +976,10 @@ Builder.CreateResume(UnwindData); <ul> + <li>The X86 backend now supports + all <a href="http://llvm.org/PR879">inline assembly that uses the X86 + floating point stack</a>.</li> + <li>The CRC32 intrinsics have been renamed. The intrinsics were previously <code>@llvm.x86.sse42.crc32.[8|16|32]</code> and <code>@llvm.x86.sse42.crc64.[8|64]</code>. They have been renamed to @@ -962,12 +1000,57 @@ Builder.CreateResume(UnwindData); <p>New features of the ARM target include:</p> <ul> -<!-- -<li></li> ---> + <li>Reworked Set Jump Long Jump EH Lowering,</li> + <li>improved support for Cortex-M series processors, and</li> + <li>beta quality integrated assembler support.</li> +</ul> +</div> + + +<!--=========================================================================--> +<h3> +<a name="MIPS">MIPS Target Improvements</a> +</h3> + +<div> + +<p>New features and major changes in the MIPS target include:</p> + +<ul> + <li>Most MIPS32r1 and r2 instructions are now supported.</li> + <li>LE/BE MIPS32r1/r2 has been tested extensively.</li> + <li>O32 ABI has been fully tested.</li> + <li>MIPS backend has migrated to using the MC infrastructure for assembly printing. Initial support for direct object code emission has been implemented too.</li> + <li>Delay slot filler has been updated. 
Now it tries to fill delay slots with useful instructions instead of always filling them with NOPs.</li> + <li>Support for old-style JIT is complete.</li> + <li>Support for old architectures (MIPS1 and MIPS2) has been removed.</li> + <li>Initial support for MIPS64 has been added.</li> </ul> </div> + +<!--=========================================================================--> +<h3> + <a name="PTX">PTX Target Improvements</a> +</h3> + +<div> + + <p> + The PTX back-end is still experimental, but is fairly usable for compute kernels + in LLVM 3.0. Most scalar arithmetic is implemented, as well as intrinsics to + access the special PTX registers and sync instructions. The major missing + pieces are texture/sampler support and some vector operations.</p> + <p>That said, the backend is already being used for domain-specific languages + and works well with the <a href="http://www.pcc.me.uk/~peter/libclc/">libclc + library</a> to supply OpenCL built-ins. With it, you can use Clang to compile + OpenCL code into PTX and execute it by loading the resulting PTX as a binary + blob using the nVidia OpenCL library. It has been tested with several OpenCL + programs, including some from the nVidia GPU Computing SDK, and the performance + is on par with the nVidia compiler.</p> + +</div> + <!--=========================================================================--> <h3> <a name="OtherTS">Other Target Specific Improvements</a> @@ -1191,28 +1274,9 @@ Builder.CreateResume(UnwindData); <div> <ul> - <li>The X86 backend does not yet support - all <a href="http://llvm.org/PR879">inline assembly that uses the X86 - floating point stack</a>. It supports the 'f' and 't' constraints, but - not 'u'.</li> - - <li>The X86-64 backend does not yet support the LLVM IR instruction - <tt>va_arg</tt>. Currently, front-ends support variadic argument - constructs on X86-64 by lowering them manually.</li> - - <li>Windows x64 (aka Win64) code generator has a few issues. - <ul> - <li>On mingw-w64, you will see unresolved symbol <tt>__chkstk</tt> due - to <a href="http://llvm.org/bugs/show_bug.cgi?id=8919">Bug 8919</a>. - It is fixed - in <a href="http://lists.cs.uiuc.edu/pipermail/llvm-commits/Week-of-Mon-20110321/118499.html">r128206</a>.</li> - - <li>Miss-aligned MOVDQA might crash your program. It is due to - <a href="http://llvm.org/bugs/show_bug.cgi?id=9483">Bug 9483</a>, lack - of handling aligned internal globals.</li> - </ul> - </li> - + <li>The X86-64 backend <a href="http://llvm.org/PR1740">does not yet support + the <tt>va_arg</tt> LLVM IR instruction</a>. 
Currently, front-ends support + variadic argument constructs on X86-64 by lowering them manually.</li> </ul> </div> diff --git a/docs/SourceLevelDebugging.html b/docs/SourceLevelDebugging.html index 6eaaa24..f2741a2 100644 --- a/docs/SourceLevelDebugging.html +++ b/docs/SourceLevelDebugging.html @@ -53,6 +53,19 @@ <li><a href="#ccxx_composite_types">C/C++ struct/union types</a></li> <li><a href="#ccxx_enumeration_types">C/C++ enumeration types</a></li> </ol></li> + <li><a href="#llvmdwarfextension">LLVM Dwarf Extensions</a> + <ol> + <li><a href="#objcproperty">Debugging Information Extension + for Objective C Properties</a></li> + <ul> + <li><a href="#objcpropertyintroduction">Introduction</a></li> + <li><a href="#objcpropertyproposal">Proposal</a></li> + <li><a href="#objcpropertynewattributes">New DWARF Attributes</a></li> + <li><a href="#objcpropertynewconstants">New DWARF Constants</a></li> + </ul> + + </ol> + </li> </ul> </td> <td class="right"> @@ -1803,6 +1816,217 @@ enum Trees { </div> + +<!-- *********************************************************************** --> +<h2> + <a name="llvmdwarfextension">Debugging information format</a> +</h2> +<!-- *********************************************************************** --> +<div> +<!-- ======================================================================= --> +<h3> + <a name="objcproperty">Debugging Information Extension for Objective C +Properties</a></li> +</h3> +<div> +<!-- *********************************************************************** --> +<h4> + <a name="objcpropertyintroduction">Introduction</a> +</h4> +<!-- *********************************************************************** --> + +<div> +<p>Objective C provides a simpler way to declare and define accessor methods +using declared properties. The language provides features to declare a +property and to let compiler synthesize accessor methods. +</p> + +<p>The debugger lets developer inspect Objective C interfaces and their +instance variables and class variables. However, the debugger does not know +anything about the properties defined in Objective C interfaces. The debugger +consumes information generated by compiler in DWARF format. The format does +not support encoding of Objective C properties. This proposal describes DWARF +extensions to encode Objective C properties, which the debugger can use to let +developers inspect Objective C properties. +</p> + +</div> + + +<!-- *********************************************************************** --> +<h4> + <a name="objcpropertyproposal">Proposal</a> +</h4> +<!-- *********************************************************************** --> + +<div> +<p>Objective C properties are always backed by an instance variable. The +instance variables backing properties are identified using +DW_AT_APPLE_property_name attribute. The instance variables with this +attribute may not have data location attributes. The location of instance +variables is determined by debugger only after consulting Objective C runtime. 
+</p> + +<div class="doc_code"> +<pre> +@interface I1 { + int n2; +} + +@property p1; +@property p2; +@end + +@implementation I1 +@synthesize p1; +@synthesize p2 = n2; +@end + + +TAG_structure_type [7] * + AT_APPLE_runtime_class( 0x10 ) + AT_name( "I1" ) + AT_decl_file( "Objc_Property.m" ) + AT_decl_line( 3 ) + + TAG_member [8] + AT_name( "p1" ) + AT_APPLE_property_name(“p1”) + AT_type( {0x00000147} ( int ) ) + + TAG_member [8] + AT_name( "n2" ) + AT_APPLE_property_name(“p2”) + AT_type( {0x00000147} ( int ) ) +</pre> +</div> + +<p> Developers can decorate a property with attributes which are encoded using +DW_AT_APPLE_property_attribute. +</p> + +<div class="doc_code"> +<pre> +@property (readonly, nonatomic) int pr; + + +TAG_member [8] + AT_name(“pr”) + AT_APPLE_property_name(“pr”) + AT_type ( {0x00000147} (int) ) + AT_APPLE_property_attribute (DW_APPLE_PROPERTY_readonly, DW_APPLE_PROPERTY_nonatomic) +</pre> +</div> + +<p> The setter and getter method names are attached to the property using +DW_AT_APPLE_property_setter and DW_AT_APPLE_property_getter attributes. +</p> +<div class="doc_code"> +<pre> +@interface I1 +@property (setter=myOwnP3Setter:) int p3; +-(void)myOwnP3Setter:(int)a; +@end + +@implementation I1 +@synthesize p3; +-(void)myOwnP3Setter:(int)a{ } +@end + +0x000003bd: TAG_structure_type [7] * + AT_APPLE_runtime_class( 0x10 ) + AT_name( "I1" ) + AT_decl_file( "Objc_Property.m" ) + AT_decl_line( 3 ) +0x000003f3: TAG_member [8] + AT_name( "p3" ) + AT_APPLE_property_name(“p3”) + AT_APPLE_property_setter(“myOwnP3Setter:”) + AT_type( {0x00000147} ( int ) ) +</pre> +</div> + +</div> + +<!-- *********************************************************************** --> +<h4> + <a name="objcpropertynewattributes">New DWARF Attributes</a> +</h4> +<!-- *********************************************************************** --> + +<div> +<table border="1" cellspacing="0"> + <tr> + <th width=200 >Attribute</th> + <th width=200 >Value</th> + <th width=200 >Classes</th> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_property_name</td> + <td width=200 >0x3fe8</td> + <td width=200 >String</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_property_getter</td> + <td width=200 >0x3fe9</td> + <td width=200 >String</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_property_setter</td> + <td width=200 >0x3fea</td> + <td width=200 >String</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_property_attribute</td> + <td width=200 >0x3feb</td> + <td width=200 >Constant</td> + </tr> +</table> + +</div> + +<!-- *********************************************************************** --> +<h4> + <a name="objcpropertynewconstants">New DWARF Constants</a> +</h4> +<!-- *********************************************************************** --> + +<div> +<table border="1" cellspacing="0"> + <tr> + <th width=200 >Name</th> + <th width=200 >Value</th> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_PROPERTY_readonly</td> + <td width=200 >0x1</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_PROPERTY_readwrite</td> + <td width=200 >0x2</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_PROPERTY_assign</td> + <td width=200 >0x4</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_PROPERTY_retain</td> + <td width=200 >0x8</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_PROPERTY_copy</td> + <td width=200 >0x10</td> + </tr> + <tr> + <td width=200 >DW_AT_APPLE_PROPERTY_nonatomic</td> + <td width=200 >0x20</td> + </tr> +</table> + +</div> +</div> +</div> + <!-- *********************************************************************** --> 
<hr> diff --git a/docs/index.html b/docs/index.html index 3e74a24..bfca45b 100644 --- a/docs/index.html +++ b/docs/index.html @@ -212,9 +212,6 @@ JITed code with GDB.</li> <li><a href="BranchWeightMetadata.html">Branch Weight Metadata</a> - Provides information about Branch Prediction Information.</li> -<li><a href="ObjCPropertyDebugInfo.html">Objective C Property Debug Info</a> -- Debugging information extension for Objective C Property.</li> - </ul> <!--=======================================================================--> diff --git a/include/llvm/ADT/IntrusiveRefCntPtr.h b/include/llvm/ADT/IntrusiveRefCntPtr.h index 2f6fd2b..8757f00 100644 --- a/include/llvm/ADT/IntrusiveRefCntPtr.h +++ b/include/llvm/ADT/IntrusiveRefCntPtr.h @@ -46,6 +46,7 @@ namespace llvm { public: RefCountedBase() : ref_cnt(0) {} + RefCountedBase(const RefCountedBase &) : ref_cnt(0) {} void Retain() const { ++ref_cnt; } void Release() const { @@ -67,6 +68,8 @@ namespace llvm { protected: RefCountedBaseVPTR() : ref_cnt(0) {} + RefCountedBaseVPTR(const RefCountedBaseVPTR &) : ref_cnt(0) {} + virtual ~RefCountedBaseVPTR() {} void Retain() const { ++ref_cnt; } diff --git a/include/llvm/Analysis/BlockFrequencyImpl.h b/include/llvm/Analysis/BlockFrequencyImpl.h index 0fb2bd7..a33cb1f 100644 --- a/include/llvm/Analysis/BlockFrequencyImpl.h +++ b/include/llvm/Analysis/BlockFrequencyImpl.h @@ -24,7 +24,6 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" #include <vector> -#include <sstream> #include <string> namespace llvm { @@ -52,15 +51,16 @@ class BlockFrequencyImpl { const uint32_t EntryFreq; std::string getBlockName(BasicBlock *BB) const { - return BB->getNameStr(); + return BB->getName().str(); } std::string getBlockName(MachineBasicBlock *MBB) const { - std::stringstream ss; + std::string str; + raw_string_ostream ss(str); ss << "BB#" << MBB->getNumber(); if (const BasicBlock *BB = MBB->getBasicBlock()) - ss << " derived from LLVM BB " << BB->getNameStr(); + ss << " derived from LLVM BB " << BB->getName(); return ss.str(); } diff --git a/include/llvm/Analysis/CFGPrinter.h b/include/llvm/Analysis/CFGPrinter.h index 223bdec..8f7ebcb 100644 --- a/include/llvm/Analysis/CFGPrinter.h +++ b/include/llvm/Analysis/CFGPrinter.h @@ -29,13 +29,13 @@ struct DOTGraphTraits<const Function*> : public DefaultDOTGraphTraits { DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} static std::string getGraphName(const Function *F) { - return "CFG for '" + F->getNameStr() + "' function"; + return "CFG for '" + F->getName().str() + "' function"; } static std::string getSimpleNodeLabel(const BasicBlock *Node, const Function *) { if (!Node->getName().empty()) - return Node->getNameStr(); + return Node->getName().str(); std::string Str; raw_string_ostream OS(Str); diff --git a/include/llvm/Analysis/CaptureTracking.h b/include/llvm/Analysis/CaptureTracking.h index 4461564..01eca60 100644 --- a/include/llvm/Analysis/CaptureTracking.h +++ b/include/llvm/Analysis/CaptureTracking.h @@ -33,132 +33,33 @@ namespace llvm { bool ReturnCaptures, bool StoreCaptures); - /// PointerMayBeCaptured - Visit the value and the values derived from it and - /// find values which appear to be capturing the pointer value. This feeds - /// results into and is controlled by the templated CaptureTracker object: - /// - /// struct YourCaptureTracker { - /// /// tooManyUses - The depth of traversal has breached a limit. - /// /// The tracker should conservatively assume that the value is captured. 
- /// void tooManyUses(); - /// - /// /// shouldExplore - This is the use of a value derived from the pointer. - /// /// Return false to prune the search (ie., assume that none of its users - /// /// could possibly capture) return false. To search it, return true. - /// /// - /// /// Also, U->getUser() is guaranteed to be an Instruction. - /// bool shouldExplore(Use *U); - /// - /// /// captured - The instruction I captured the pointer. Return true to - /// /// stop the traversal or false to continue looking for more capturing - /// /// instructions. - /// bool captured(Instruction *I); - /// - /// /// Provide your own getters for the state. - /// }; - template<typename CaptureTracker> - void PointerMayBeCaptured(const Value *V, CaptureTracker &Tracker); -} // end namespace llvm - -template<typename CaptureTracker> -void llvm::PointerMayBeCaptured(const llvm::Value *V, CaptureTracker &Tracker) { - assert(V->getType()->isPointerTy() && "Capture is for pointers only!"); - SmallVector<Use*, 20> Worklist; - SmallSet<Use*, 20> Visited; - int Count = 0; - - for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); - UI != UE; ++UI) { - // If there are lots of uses, conservatively say that the value - // is captured to avoid taking too much compile time. - if (Count++ >= 20) - return Tracker.tooManyUses(); + /// This callback is used in conjunction with PointerMayBeCaptured. In + /// addition to the interface here, you'll need to provide your own getters + /// to see whether anything was captured. + struct CaptureTracker { + virtual ~CaptureTracker(); - Use *U = &UI.getUse(); - if (!Tracker.shouldExplore(U)) continue; - Visited.insert(U); - Worklist.push_back(U); - } + /// tooManyUses - The depth of traversal has breached a limit. There may be + /// capturing instructions that will not be passed into captured(). + virtual void tooManyUses() = 0; - while (!Worklist.empty()) { - Use *U = Worklist.pop_back_val(); - Instruction *I = cast<Instruction>(U->getUser()); - V = U->get(); + /// shouldExplore - This is the use of a value derived from the pointer. + /// To prune the search (ie., assume that none of its users could possibly + /// capture) return false. To search it, return true. + /// + /// U->getUser() is always an Instruction. + virtual bool shouldExplore(Use *U) = 0; - switch (I->getOpcode()) { - case Instruction::Call: - case Instruction::Invoke: { - CallSite CS(I); - // Not captured if the callee is readonly, doesn't return a copy through - // its return value and doesn't unwind (a readonly function can leak bits - // by throwing an exception or not depending on the input value). - if (CS.onlyReadsMemory() && CS.doesNotThrow() && I->getType()->isVoidTy()) - break; + /// captured - The instruction I captured the pointer. Return true to + /// stop the traversal or false to continue looking for more capturing + /// instructions. + virtual bool captured(Instruction *I) = 0; + }; - // Not captured if only passed via 'nocapture' arguments. Note that - // calling a function pointer does not in itself cause the pointer to - // be captured. This is a subtle point considering that (for example) - // the callee might return its own address. It is analogous to saying - // that loading a value from a pointer does not cause the pointer to be - // captured, even though the loaded value might be the pointer itself - // (think of self-referential objects). 
- CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); - for (CallSite::arg_iterator A = B; A != E; ++A) - if (A->get() == V && !CS.paramHasAttr(A - B + 1, Attribute::NoCapture)) - // The parameter is not marked 'nocapture' - captured. - if (Tracker.captured(I)) - return; - break; - } - case Instruction::Load: - // Loading from a pointer does not cause it to be captured. - break; - case Instruction::VAArg: - // "va-arg" from a pointer does not cause it to be captured. - break; - case Instruction::Store: - if (V == I->getOperand(0)) - // Stored the pointer - conservatively assume it may be captured. - if (Tracker.captured(I)) - return; - // Storing to the pointee does not cause the pointer to be captured. - break; - case Instruction::BitCast: - case Instruction::GetElementPtr: - case Instruction::PHI: - case Instruction::Select: - // The original value is not captured via this if the new value isn't. - for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end(); - UI != UE; ++UI) { - Use *U = &UI.getUse(); - if (Visited.insert(U)) - if (Tracker.shouldExplore(U)) - Worklist.push_back(U); - } - break; - case Instruction::ICmp: - // Don't count comparisons of a no-alias return value against null as - // captures. This allows us to ignore comparisons of malloc results - // with null, for example. - if (isNoAliasCall(V->stripPointerCasts())) - if (ConstantPointerNull *CPN = - dyn_cast<ConstantPointerNull>(I->getOperand(1))) - if (CPN->getType()->getAddressSpace() == 0) - break; - // Otherwise, be conservative. There are crazy ways to capture pointers - // using comparisons. - if (Tracker.captured(I)) - return; - break; - default: - // Something else - be conservative and say it is captured. - if (Tracker.captured(I)) - return; - break; - } - } - - // All uses examined. -} + /// PointerMayBeCaptured - Visit the value and the values derived from it and + /// find values which appear to be capturing the pointer value. This feeds + /// results into and is controlled by the CaptureTracker object. + void PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker); +} // end namespace llvm #endif diff --git a/include/llvm/Analysis/DOTGraphTraitsPass.h b/include/llvm/Analysis/DOTGraphTraitsPass.h index 30741c4..b701b8f 100644 --- a/include/llvm/Analysis/DOTGraphTraitsPass.h +++ b/include/llvm/Analysis/DOTGraphTraitsPass.h @@ -31,7 +31,7 @@ struct DOTGraphTraitsViewer : public FunctionPass { std::string Title, GraphName; Graph = &getAnalysis<Analysis>(); GraphName = DOTGraphTraits<Analysis*>::getGraphName(Graph); - Title = GraphName + " for '" + F.getNameStr() + "' function"; + Title = GraphName + " for '" + F.getName().str() + "' function"; ViewGraph(Graph, Name, Simple, Title); return false; @@ -55,7 +55,7 @@ struct DOTGraphTraitsPrinter : public FunctionPass { virtual bool runOnFunction(Function &F) { Analysis *Graph; - std::string Filename = Name + "." + F.getNameStr() + ".dot"; + std::string Filename = Name + "." 
+ F.getName().str() + ".dot"; errs() << "Writing '" << Filename << "'..."; std::string ErrorInfo; @@ -64,7 +64,7 @@ struct DOTGraphTraitsPrinter : public FunctionPass { std::string Title, GraphName; GraphName = DOTGraphTraits<Analysis*>::getGraphName(Graph); - Title = GraphName + " for '" + F.getNameStr() + "' function"; + Title = GraphName + " for '" + F.getName().str() + "' function"; if (ErrorInfo.empty()) WriteGraph(File, Graph, Simple, Title); diff --git a/include/llvm/Analysis/DebugInfo.h b/include/llvm/Analysis/DebugInfo.h index 09edfd7..c4489cf 100644 --- a/include/llvm/Analysis/DebugInfo.h +++ b/include/llvm/Analysis/DebugInfo.h @@ -135,8 +135,8 @@ namespace llvm { public: explicit DISubrange(const MDNode *N = 0) : DIDescriptor(N) {} - int64_t getLo() const { return (int64_t)getUInt64Field(1); } - int64_t getHi() const { return (int64_t)getUInt64Field(2); } + uint64_t getLo() const { return getUInt64Field(1); } + uint64_t getHi() const { return getUInt64Field(2); } }; /// DIArray - This descriptor holds an array of descriptors. diff --git a/include/llvm/Analysis/RegionInfo.h b/include/llvm/Analysis/RegionInfo.h index 9d89545..b098eea 100644 --- a/include/llvm/Analysis/RegionInfo.h +++ b/include/llvm/Analysis/RegionInfo.h @@ -681,7 +681,7 @@ inline raw_ostream &operator<<(raw_ostream &OS, const RegionNode &Node) { if (Node.isSubRegion()) return OS << Node.getNodeAs<Region>()->getNameStr(); else - return OS << Node.getNodeAs<BasicBlock>()->getNameStr(); + return OS << Node.getNodeAs<BasicBlock>()->getName(); } } // End llvm namespace #endif diff --git a/include/llvm/CodeGen/EdgeBundles.h b/include/llvm/CodeGen/EdgeBundles.h index 8aab3c6..a1d29b1 100644 --- a/include/llvm/CodeGen/EdgeBundles.h +++ b/include/llvm/CodeGen/EdgeBundles.h @@ -18,6 +18,7 @@ #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/IntEqClasses.h" +#include "llvm/ADT/Twine.h" #include "llvm/CodeGen/MachineFunctionPass.h" namespace llvm { @@ -61,7 +62,7 @@ private: /// Specialize WriteGraph, the standard implementation won't work. raw_ostream &WriteGraph(raw_ostream &O, const EdgeBundles &G, bool ShortNames = false, - const std::string &Title = ""); + const Twine &Title = ""); } // end namespace llvm diff --git a/include/llvm/CodeGen/MachineFunctionAnalysis.h b/include/llvm/CodeGen/MachineFunctionAnalysis.h index 50676ad..50ea206 100644 --- a/include/llvm/CodeGen/MachineFunctionAnalysis.h +++ b/include/llvm/CodeGen/MachineFunctionAnalysis.h @@ -26,17 +26,14 @@ class MachineFunction; struct MachineFunctionAnalysis : public FunctionPass { private: const TargetMachine &TM; - CodeGenOpt::Level OptLevel; MachineFunction *MF; unsigned NextFnNum; public: static char ID; - explicit MachineFunctionAnalysis(const TargetMachine &tm, - CodeGenOpt::Level OL = CodeGenOpt::Default); + explicit MachineFunctionAnalysis(const TargetMachine &tm); ~MachineFunctionAnalysis(); MachineFunction &getMF() const { return *MF; } - CodeGenOpt::Level getOptLevel() const { return OptLevel; } virtual const char* getPassName() const { return "Machine Function Analysis"; diff --git a/include/llvm/CodeGen/ValueTypes.h b/include/llvm/CodeGen/ValueTypes.h index cae0bcb..7a7080f 100644 --- a/include/llvm/CodeGen/ValueTypes.h +++ b/include/llvm/CodeGen/ValueTypes.h @@ -83,7 +83,7 @@ namespace llvm { isVoid = 35, // This has no value - untyped = 36, // This value takes a register, but has + Untyped = 36, // This value takes a register, but has // unspecified type. The register class // will be determined by the opcode. 
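The CaptureTracking.h hunk earlier in this diff replaces the templated PointerMayBeCaptured with a virtual CaptureTracker interface. A minimal hypothetical client of that new interface (not part of this patch) might look like the following; it conservatively records whether the pointer escapes anywhere:

#include "llvm/Analysis/CaptureTracking.h"
using namespace llvm;

struct EscapeTracker : CaptureTracker {
  bool Captured;
  EscapeTracker() : Captured(false) {}
  // Traversal hit its depth limit: conservatively assume a capture.
  void tooManyUses() { Captured = true; }
  // Visit every use; return false here to prune a subtree instead.
  bool shouldExplore(Use *U) { return true; }
  // First capturing instruction found; returning true stops the walk.
  bool captured(Instruction *I) { Captured = true; return true; }
};

// Usage: EscapeTracker T; PointerMayBeCaptured(V, &T); if (T.Captured) ...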
diff --git a/include/llvm/Config/config.h.cmake b/include/llvm/Config/config.h.cmake index 69e3eef..11e5b5a 100644 --- a/include/llvm/Config/config.h.cmake +++ b/include/llvm/Config/config.h.cmake @@ -297,6 +297,9 @@ /* Define to 1 if you have the `powf' function. */ #cmakedefine HAVE_POWF ${HAVE_POWF} +/* Define to 1 if you have the `pread' function. */ +#cmakedefine HAVE_PREAD ${HAVE_PREAD} + /* Define if libtool can extract symbol lists from object files. */ #undef HAVE_PRELOADED_SYMBOLS diff --git a/include/llvm/Config/config.h.in b/include/llvm/Config/config.h.in index 813c6eb..b3a9590 100644 --- a/include/llvm/Config/config.h.in +++ b/include/llvm/Config/config.h.in @@ -295,6 +295,9 @@ /* Define to 1 if you have the `powf' function. */ #undef HAVE_POWF +/* Define to 1 if you have the `pread' function. */ +#undef HAVE_PREAD + /* Define if libtool can extract symbol lists from object files. */ #undef HAVE_PRELOADED_SYMBOLS diff --git a/include/llvm/InitializePasses.h b/include/llvm/InitializePasses.h index a917d2e..ceb1771 100644 --- a/include/llvm/InitializePasses.h +++ b/include/llvm/InitializePasses.h @@ -96,6 +96,7 @@ void initializeEdgeBundlesPass(PassRegistry&); void initializeEdgeProfilerPass(PassRegistry&); void initializePathProfilerPass(PassRegistry&); void initializeGCOVProfilerPass(PassRegistry&); +void initializeAddressSanitizerPass(PassRegistry&); void initializeEarlyCSEPass(PassRegistry&); void initializeExpandISelPseudosPass(PassRegistry&); void initializeFindUsedTypesPass(PassRegistry&); diff --git a/include/llvm/MC/MCCodeGenInfo.h b/include/llvm/MC/MCCodeGenInfo.h index 1c54c47..e40a052 100644 --- a/include/llvm/MC/MCCodeGenInfo.h +++ b/include/llvm/MC/MCCodeGenInfo.h @@ -28,13 +28,20 @@ namespace llvm { /// CodeModel::Model CMModel; + /// OptLevel - Optimization level. + /// + CodeGenOpt::Level OptLevel; + public: void InitMCCodeGenInfo(Reloc::Model RM = Reloc::Default, - CodeModel::Model CM = CodeModel::Default); + CodeModel::Model CM = CodeModel::Default, + CodeGenOpt::Level OL = CodeGenOpt::Default); Reloc::Model getRelocationModel() const { return RelocationModel; } CodeModel::Model getCodeModel() const { return CMModel; } + + CodeGenOpt::Level getOptLevel() const { return OptLevel; } }; } // namespace llvm diff --git a/include/llvm/MC/MCFixup.h b/include/llvm/MC/MCFixup.h index 6fde797..7404270 100644 --- a/include/llvm/MC/MCFixup.h +++ b/include/llvm/MC/MCFixup.h @@ -26,6 +26,10 @@ enum MCFixupKind { FK_PCRel_2, ///< A two-byte pc relative fixup. FK_PCRel_4, ///< A four-byte pc relative fixup. FK_PCRel_8, ///< A eight-byte pc relative fixup. + FK_GPRel_1, ///< A one-byte gp relative fixup. + FK_GPRel_2, ///< A two-byte gp relative fixup. + FK_GPRel_4, ///< A four-byte gp relative fixup. + FK_GPRel_8, ///< A eight-byte gp relative fixup. 
FirstTargetFixupKind = 128, diff --git a/include/llvm/MC/MCObjectStreamer.h b/include/llvm/MC/MCObjectStreamer.h index f897e64..01d254a 100644 --- a/include/llvm/MC/MCObjectStreamer.h +++ b/include/llvm/MC/MCObjectStreamer.h @@ -77,6 +77,7 @@ public: unsigned PointerSize); virtual void EmitDwarfAdvanceFrameAddr(const MCSymbol *LastLabel, const MCSymbol *Label); + virtual void EmitGPRel32Value(const MCExpr *Value); virtual void Finish(); /// @} diff --git a/include/llvm/Object/Archive.h b/include/llvm/Object/Archive.h index b12b17e..9e1369a 100644 --- a/include/llvm/Object/Archive.h +++ b/include/llvm/Object/Archive.h @@ -34,6 +34,10 @@ public: return (Parent == other.Parent) && (Data.begin() == other.Data.begin()); } + bool operator <(const Child &other) const { + return Data.begin() < other.Data.begin(); + } + Child getNext() const; error_code getName(StringRef &Result) const; int getLastModified() const; @@ -64,6 +68,10 @@ public: return !(*this == other); } + bool operator <(const child_iterator &other) const { + return child < other.child; + } + child_iterator& operator++() { // Preincrement child = child.getNext(); return *this; diff --git a/include/llvm/Pass.h b/include/llvm/Pass.h index 04dd8b6..4fd4b2c 100644 --- a/include/llvm/Pass.h +++ b/include/llvm/Pass.h @@ -86,7 +86,7 @@ class Pass { Pass(const Pass &); // DO NOT IMPLEMENT public: - explicit Pass(PassKind K, char &pid); + explicit Pass(PassKind K, char &pid) : Resolver(0), PassID(&pid), Kind(K) { } virtual ~Pass(); @@ -99,7 +99,7 @@ public: virtual const char *getPassName() const; /// getPassID - Return the PassID number that corresponds to this pass. - virtual AnalysisID getPassID() const { + AnalysisID getPassID() const { return PassID; } diff --git a/include/llvm/Support/BranchProbability.h b/include/llvm/Support/BranchProbability.h index 4b5d904..eedf692 100644 --- a/include/llvm/Support/BranchProbability.h +++ b/include/llvm/Support/BranchProbability.h @@ -42,7 +42,7 @@ public: uint32_t getDenominator() const { return D; } // Return (1 - Probability). - BranchProbability getCompl() { + BranchProbability getCompl() const { return BranchProbability(D - N, D); } diff --git a/include/llvm/Support/COFF.h b/include/llvm/Support/COFF.h index 6739255..88c60ba 100644 --- a/include/llvm/Support/COFF.h +++ b/include/llvm/Support/COFF.h @@ -24,6 +24,7 @@ #define LLVM_SUPPORT_WIN_COFF_H #include "llvm/Support/DataTypes.h" +#include <cassert> #include <cstring> namespace llvm { @@ -49,8 +50,65 @@ namespace COFF { }; enum MachineTypes { - IMAGE_FILE_MACHINE_I386 = 0x14C, - IMAGE_FILE_MACHINE_AMD64 = 0x8664 + IMAGE_FILE_MACHINE_UNKNOWN = 0x0, + IMAGE_FILE_MACHINE_AM33 = 0x13, + IMAGE_FILE_MACHINE_AMD64 = 0x8664, + IMAGE_FILE_MACHINE_ARM = 0x1C0, + IMAGE_FILE_MACHINE_ARMV7 = 0x1C4, + IMAGE_FILE_MACHINE_EBC = 0xEBC, + IMAGE_FILE_MACHINE_I386 = 0x14C, + IMAGE_FILE_MACHINE_IA64 = 0x200, + IMAGE_FILE_MACHINE_M32R = 0x9041, + IMAGE_FILE_MACHINE_MIPS16 = 0x266, + IMAGE_FILE_MACHINE_MIPSFPU = 0x366, + IMAGE_FILE_MACHINE_MIPSFPU16 = 0x466, + IMAGE_FILE_MACHINE_POWERPC = 0x1F0, + IMAGE_FILE_MACHINE_POWERPCFP = 0x1F1, + IMAGE_FILE_MACHINE_R4000 = 0x166, + IMAGE_FILE_MACHINE_SH3 = 0x1A2, + IMAGE_FILE_MACHINE_SH3DSP = 0x1A3, + IMAGE_FILE_MACHINE_SH4 = 0x1A6, + IMAGE_FILE_MACHINE_SH5 = 0x1A8, + IMAGE_FILE_MACHINE_THUMB = 0x1C2, + IMAGE_FILE_MACHINE_WCEMIPSV2 = 0x169 + }; + + enum Characteristics { + /// The file does not contain base relocations and must be loaded at its + /// preferred base. If this cannot be done, the loader will error. 
+ IMAGE_FILE_RELOCS_STRIPPED = 0x0001, + /// The file is valid and can be run. + IMAGE_FILE_EXECUTABLE_IMAGE = 0x0002, + /// COFF line numbers have been stripped. This is deprecated and should be + /// 0. + IMAGE_FILE_LINE_NUMS_STRIPPED = 0x0004, + /// COFF symbol table entries for local symbols have been removed. This is + /// deprecated and should be 0. + IMAGE_FILE_LOCAL_SYMS_STRIPPED = 0x0008, + /// Aggressively trim working set. This is deprecated and must be 0. + IMAGE_FILE_AGGRESSIVE_WS_TRIM = 0x0010, + /// Image can handle > 2GiB addresses. + IMAGE_FILE_LARGE_ADDRESS_AWARE = 0x0020, + /// Little endian: the LSB precedes the MSB in memory. This is deprecated + /// and should be 0. + IMAGE_FILE_BYTES_REVERSED_LO = 0x0080, + /// Machine is based on a 32bit word architecture. + IMAGE_FILE_32BIT_MACHINE = 0x0100, + /// Debugging info has been removed. + IMAGE_FILE_DEBUG_STRIPPED = 0x0200, + /// If the image is on removable media, fully load it and copy it to swap. + IMAGE_FILE_REMOVABLE_RUN_FROM_SWAP = 0x0400, + /// If the image is on network media, fully load it and copy it to swap. + IMAGE_FILE_NET_RUN_FROM_SWAP = 0x0800, + /// The image file is a system file, not a user program. + IMAGE_FILE_SYSTEM = 0x1000, + /// The image file is a DLL. + IMAGE_FILE_DLL = 0x2000, + /// This file should only be run on a uniprocessor machine. + IMAGE_FILE_UP_SYSTEM_ONLY = 0x4000, + /// Big endian: the MSB precedes the LSB in memory. This is deprecated + /// and should be 0. + IMAGE_FILE_BYTES_REVERSED_HI = 0x8000 }; struct symbol { @@ -231,6 +289,24 @@ namespace COFF { IMAGE_REL_AMD64_SSPAN32 = 0x0010 }; + enum RelocationTypesARM { + IMAGE_REL_ARM_ABSOLUTE = 0x0000, + IMAGE_REL_ARM_ADDR32 = 0x0001, + IMAGE_REL_ARM_ADDR32NB = 0x0002, + IMAGE_REL_ARM_BRANCH24 = 0x0003, + IMAGE_REL_ARM_BRANCH11 = 0x0004, + IMAGE_REL_ARM_TOKEN = 0x0005, + IMAGE_REL_ARM_BLX24 = 0x0008, + IMAGE_REL_ARM_BLX11 = 0x0009, + IMAGE_REL_ARM_SECTION = 0x000E, + IMAGE_REL_ARM_SECREL = 0x000F, + IMAGE_REL_ARM_MOV32A = 0x0010, + IMAGE_REL_ARM_MOV32T = 0x0011, + IMAGE_REL_ARM_BRANCH20T = 0x0012, + IMAGE_REL_ARM_BRANCH24T = 0x0014, + IMAGE_REL_ARM_BLX23T = 0x0015 + }; + enum COMDATType { IMAGE_COMDAT_SELECT_NODUPLICATES = 1, IMAGE_COMDAT_SELECT_ANY, @@ -292,7 +368,219 @@ namespace COFF { AuxiliarySectionDefinition SectionDefinition; }; -} // End namespace llvm. + /// @brief The Import Directory Table. + /// + /// There is a single array of these and one entry per imported DLL. + struct ImportDirectoryTableEntry { + uint32_t ImportLookupTableRVA; + uint32_t TimeDateStamp; + uint32_t ForwarderChain; + uint32_t NameRVA; + uint32_t ImportAddressTableRVA; + }; + + /// @brief The PE32 Import Lookup Table. + /// + /// There is an array of these for each imported DLL. It represents either + /// the ordinal to import from the target DLL, or a name to lookup and import + /// from the target DLL. + /// + /// This also happens to be the same format used by the Import Address Table + /// when it is initially written out to the image. + struct ImportLookupTableEntry32 { + uint32_t data; + + /// @brief Is this entry specified by ordinal, or name? + bool isOrdinal() const { return data & 0x80000000; } + + /// @brief Get the ordinal value of this entry. isOrdinal must be true. + uint16_t getOrdinal() const { + assert(isOrdinal() && "ILT entry is not an ordinal!"); + return data & 0xFFFF; + } + + /// @brief Set the ordinal value and set isOrdinal to true. 
+ void setOrdinal(uint16_t o) { + data = o; + data |= 0x80000000; + } + + /// @brief Get the Hint/Name entry RVA. isOrdinal must be false. + uint32_t getHintNameRVA() const { + assert(!isOrdinal() && "ILT entry is not a Hint/Name RVA!"); + return data; + } + + /// @brief Set the Hint/Name entry RVA and set isOrdinal to false. + void setHintNameRVA(uint32_t rva) { data = rva; } + }; + + /// @brief The DOS compatible header at the front of all PEs. + struct DOSHeader { + uint16_t Magic; + uint16_t UsedBytesInTheLastPage; + uint16_t FileSizeInPages; + uint16_t NumberOfRelocationItems; + uint16_t HeaderSizeInParagraphs; + uint16_t MinimumExtraParagraphs; + uint16_t MaximumExtraParagraphs; + uint16_t InitialRelativeSS; + uint16_t InitialSP; + uint16_t Checksum; + uint16_t InitialIP; + uint16_t InitialRelativeCS; + uint16_t AddressOfRelocationTable; + uint16_t OverlayNumber; + uint16_t Reserved[4]; + uint16_t OEMid; + uint16_t OEMinfo; + uint16_t Reserved2[10]; + uint32_t AddressOfNewExeHeader; + }; + + struct PEHeader { + uint32_t Signature; + header COFFHeader; + uint16_t Magic; + uint8_t MajorLinkerVersion; + uint8_t MinorLinkerVersion; + uint32_t SizeOfCode; + uint32_t SizeOfInitializedData; + uint32_t SizeOfUninitializedData; + uint32_t AddressOfEntryPoint; // RVA + uint32_t BaseOfCode; // RVA + uint32_t BaseOfData; // RVA + uint64_t ImageBase; + uint32_t SectionAlignment; + uint32_t FileAlignment; + uint16_t MajorOperatingSystemVersion; + uint16_t MinorOperatingSystemVersion; + uint16_t MajorImageVersion; + uint16_t MinorImageVersion; + uint16_t MajorSubsystemVersion; + uint16_t MinorSubsystemVersion; + uint32_t Win32VersionValue; + uint32_t SizeOfImage; + uint32_t SizeOfHeaders; + uint32_t CheckSum; + uint16_t Subsystem; + uint16_t DLLCharacteristics; + uint64_t SizeOfStackReserve; + uint64_t SizeOfStackCommit; + uint64_t SizeOfHeapReserve; + uint64_t SizeOfHeapCommit; + uint32_t LoaderFlags; + uint32_t NumberOfRvaAndSize; + }; + + struct DataDirectory { + uint32_t RelativeVirtualAddress; + uint32_t Size; + }; + + enum WindowsSubsystem { + IMAGE_SUBSYSTEM_UNKNOWN = 0, ///< An unknown subsystem. + IMAGE_SUBSYSTEM_NATIVE = 1, ///< Device drivers and native Windows processes + IMAGE_SUBSYSTEM_WINDOWS_GUI = 2, ///< The Windows GUI subsystem. + IMAGE_SUBSYSTEM_WINDOWS_CUI = 3, ///< The Windows character subsystem. + IMAGE_SUBSYSTEM_POSIX_CUI = 7, ///< The POSIX character subsystem. + IMAGE_SUBSYSTEM_WINDOWS_CE_GUI = 9, ///< Windows CE. + IMAGE_SUBSYSTEM_EFI_APPLICATION = 10, ///< An EFI application. + IMAGE_SUBSYSTEM_EFI_BOOT_SERVICE_DRIVER = 11, ///< An EFI driver with boot + /// services. + IMAGE_SUBSYSTEM_EFI_RUNTIME_DRIVER = 12, ///< An EFI driver with run-time + /// services. + IMAGE_SUBSYSTEM_EFI_ROM = 13, ///< An EFI ROM image. + IMAGE_SUBSYSTEM_XBOX = 14 ///< XBOX. + }; + + enum DLLCharacteristics { + /// DLL can be relocated at load time. + IMAGE_DLL_CHARACTERISTICS_DYNAMIC_BASE = 0x0040, + /// Code integrity checks are enforced. + IMAGE_DLL_CHARACTERISTICS_FORCE_INTEGRITY = 0x0080, + IMAGE_DLL_CHARACTERISTICS_NX_COMPAT = 0x0100, ///< Image is NX compatible. + /// Isolation aware, but do not isolate the image. + IMAGE_DLL_CHARACTERISTICS_NO_ISOLATION = 0x0200, + /// Does not use structured exception handling (SEH). No SEH handler may be + /// called in this image. + IMAGE_DLL_CHARACTERISTICS_NO_SEH = 0x0400, + /// Do not bind the image. + IMAGE_DLL_CHARACTERISTICS_NO_BIND = 0x0800, + IMAGE_DLL_CHARACTERISTICS_WDM_DRIVER = 0x2000, ///< A WDM driver. + /// Terminal Server aware. 
+ IMAGE_DLL_CHARACTERISTICS_TERMINAL_SERVER_AWARE = 0x8000 + }; + + enum DebugType { + IMAGE_DEBUG_TYPE_UNKNOWN = 0, + IMAGE_DEBUG_TYPE_COFF = 1, + IMAGE_DEBUG_TYPE_CODEVIEW = 2, + IMAGE_DEBUG_TYPE_FPO = 3, + IMAGE_DEBUG_TYPE_MISC = 4, + IMAGE_DEBUG_TYPE_EXCEPTION = 5, + IMAGE_DEBUG_TYPE_FIXUP = 6, + IMAGE_DEBUG_TYPE_OMAP_TO_SRC = 7, + IMAGE_DEBUG_TYPE_OMAP_FROM_SRC = 8, + IMAGE_DEBUG_TYPE_BORLAND = 9, + IMAGE_DEBUG_TYPE_CLSID = 11 + }; + + enum BaseRelocationType { + IMAGE_REL_BASED_ABSOLUTE = 0, + IMAGE_REL_BASED_HIGH = 1, + IMAGE_REL_BASED_LOW = 2, + IMAGE_REL_BASED_HIGHLOW = 3, + IMAGE_REL_BASED_HIGHADJ = 4, + IMAGE_REL_BASED_MIPS_JMPADDR = 5, + IMAGE_REL_BASED_ARM_MOV32A = 5, + IMAGE_REL_BASED_ARM_MOV32T = 7, + IMAGE_REL_BASED_MIPS_JMPADDR16 = 9, + IMAGE_REL_BASED_DIR64 = 10 + }; + + enum ImportType { + IMPORT_CODE = 0, + IMPORT_DATA = 1, + IMPORT_CONST = 2 + }; + + enum ImportNameType { + /// Import is by ordinal. This indicates that the value in the Ordinal/Hint + /// field of the import header is the import's ordinal. If this constant is + /// not specified, then the Ordinal/Hint field should always be interpreted + /// as the import's hint. + IMPORT_ORDINAL = 0, + /// The import name is identical to the public symbol name + IMPORT_NAME = 1, + /// The import name is the public symbol name, but skipping the leading ?, + /// @, or optionally _. + IMPORT_NAME_NOPREFIX = 2, + /// The import name is the public symbol name, but skipping the leading ?, + /// @, or optionally _, and truncating at the first @. + IMPORT_NAME_UNDECORATE = 3 + }; + + struct ImportHeader { + uint16_t Sig1; ///< Must be IMAGE_FILE_MACHINE_UNKNOWN (0). + uint16_t Sig2; ///< Must be 0xFFFF. + uint16_t Version; + uint16_t Machine; + uint32_t TimeDateStamp; + uint32_t SizeOfData; + uint16_t OrdinalHint; + uint16_t TypeInfo; + + ImportType getType() const { + return static_cast<ImportType>(TypeInfo & 0x3); + } + + ImportNameType getNameType() const { + return static_cast<ImportNameType>((TypeInfo & 0x1C) >> 3); + } + }; + } // End namespace COFF. +} // End namespace llvm. #endif diff --git a/include/llvm/Support/CallSite.h b/include/llvm/Support/CallSite.h index 04b8c4e..20634ed 100644 --- a/include/llvm/Support/CallSite.h +++ b/include/llvm/Support/CallSite.h @@ -237,6 +237,16 @@ public: #undef CALLSITE_DELEGATE_GETTER #undef CALLSITE_DELEGATE_SETTER + /// @brief Determine whether this argument is not captured. + bool doesNotCapture(unsigned ArgNo) const { + return paramHasAttr(ArgNo + 1, Attribute::NoCapture); + } + + /// @brief Determine whether this argument is passed by value. + bool isByValArgument(unsigned ArgNo) const { + return paramHasAttr(ArgNo + 1, Attribute::ByVal); + } + /// hasArgument - Returns true if this CallSite passes the given Value* as an /// argument to the called function. bool hasArgument(const Value *Arg) const { diff --git a/include/llvm/Support/CodeGen.h b/include/llvm/Support/CodeGen.h index 41351dc..3a76cc7 100644 --- a/include/llvm/Support/CodeGen.h +++ b/include/llvm/Support/CodeGen.h @@ -27,6 +27,16 @@ namespace llvm { enum Model { Default, JITDefault, Small, Kernel, Medium, Large }; } + // Code generation optimization level. 
+ namespace CodeGenOpt { + enum Level { + None, // -O0 + Less, // -O1 + Default, // -O2, -Os + Aggressive // -O3 + }; + } + } // end llvm namespace #endif diff --git a/include/llvm/Support/Compiler.h b/include/llvm/Support/Compiler.h index 14b55c1..9c5e7ec 100644 --- a/include/llvm/Support/Compiler.h +++ b/include/llvm/Support/Compiler.h @@ -61,7 +61,7 @@ #define LLVM_ATTRIBUTE_READONLY #endif -#if (__GNUC__ >= 4) +#if (__GNUC__ >= 4) && !defined(__MINGW32__) && !defined(__CYGWIN__) #define LLVM_ATTRIBUTE_WEAK __attribute__((__weak__)) #else #define LLVM_ATTRIBUTE_WEAK diff --git a/include/llvm/Support/ELF.h b/include/llvm/Support/ELF.h index c5b85e2..a3906b1 100644 --- a/include/llvm/Support/ELF.h +++ b/include/llvm/Support/ELF.h @@ -599,7 +599,23 @@ enum { R_ARM_THM_TLS_DESCSEQ32 = 0x82 }; +// Mips Specific e_flags +enum { + EF_MIPS_NOREORDER = 0x00000001, // Don't reorder instructions + EF_MIPS_PIC = 0x00000002, // Position independent code + EF_MIPS_CPIC = 0x00000004, // Call object with Position independent code + EF_MIPS_ARCH_1 = 0x00000000, // MIPS1 instruction set + EF_MIPS_ARCH_2 = 0x10000000, // MIPS2 instruction set + EF_MIPS_ARCH_3 = 0x20000000, // MIPS3 instruction set + EF_MIPS_ARCH_4 = 0x30000000, // MIPS4 instruction set + EF_MIPS_ARCH_5 = 0x40000000, // MIPS5 instruction set + EF_MIPS_ARCH_32 = 0x60000000, // MIPS32 instruction set + EF_MIPS_ARCH_32R2 = 0x70000000, // mips32r2 + EF_MIPS_ARCH = 0xf0000000 // Mask for applying EF_MIPS_ARCH_ variant +}; + // ELF Relocation types for Mips +// . enum { R_MIPS_NONE = 0, R_MIPS_16 = 1, diff --git a/include/llvm/Support/GraphWriter.h b/include/llvm/Support/GraphWriter.h index eab0c9d..ae32da5 100644 --- a/include/llvm/Support/GraphWriter.h +++ b/include/llvm/Support/GraphWriter.h @@ -296,26 +296,26 @@ public: template<typename GraphType> raw_ostream &WriteGraph(raw_ostream &O, const GraphType &G, bool ShortNames = false, - const std::string &Title = "") { + const Twine &Title = "") { // Start the graph emission process... GraphWriter<GraphType> W(O, G, ShortNames); // Emit the graph. - W.writeGraph(Title); + W.writeGraph(Title.str()); return O; } template<typename GraphType> -sys::Path WriteGraph(const GraphType &G, const std::string &Name, - bool ShortNames = false, const std::string &Title = "") { +sys::Path WriteGraph(const GraphType &G, const Twine &Name, + bool ShortNames = false, const Twine &Title = "") { std::string ErrMsg; sys::Path Filename = sys::Path::GetTemporaryDirectory(&ErrMsg); if (Filename.isEmpty()) { errs() << "Error: " << ErrMsg << "\n"; return Filename; } - Filename.appendComponent(Name + ".dot"); + Filename.appendComponent((Name + ".dot").str()); if (Filename.makeUnique(true,&ErrMsg)) { errs() << "Error: " << ErrMsg << "\n"; return sys::Path(); @@ -341,8 +341,8 @@ sys::Path WriteGraph(const GraphType &G, const std::string &Name, /// then cleanup. For use from the debugger. 
/// template<typename GraphType> -void ViewGraph(const GraphType &G, const std::string &Name, - bool ShortNames = false, const std::string &Title = "", +void ViewGraph(const GraphType &G, const Twine &Name, + bool ShortNames = false, const Twine &Title = "", GraphProgram::Name Program = GraphProgram::DOT) { sys::Path Filename = llvm::WriteGraph(G, Name, ShortNames, Title); diff --git a/include/llvm/Support/Host.h b/include/llvm/Support/Host.h index d52f631..b331016 100644 --- a/include/llvm/Support/Host.h +++ b/include/llvm/Support/Host.h @@ -33,8 +33,8 @@ namespace sys { return !isLittleEndianHost(); } - /// getDefaultTargetTriple() - Return the target triple of the running - /// system. + /// getDefaultTargetTriple() - Return the default target triple the compiler + /// has been configured to produce code for. /// /// The target triple is a string in the format of: /// CPU_TYPE-VENDOR-OPERATING_SYSTEM diff --git a/include/llvm/Support/TargetRegistry.h b/include/llvm/Support/TargetRegistry.h index bf3b7ba..e1ef39e 100644 --- a/include/llvm/Support/TargetRegistry.h +++ b/include/llvm/Support/TargetRegistry.h @@ -74,7 +74,8 @@ namespace llvm { StringRef TT); typedef MCCodeGenInfo *(*MCCodeGenInfoCtorFnTy)(StringRef TT, Reloc::Model RM, - CodeModel::Model CM); + CodeModel::Model CM, + CodeGenOpt::Level OL); typedef MCInstrInfo *(*MCInstrInfoCtorFnTy)(void); typedef MCInstrAnalysis *(*MCInstrAnalysisCtorFnTy)(const MCInstrInfo*Info); typedef MCRegisterInfo *(*MCRegInfoCtorFnTy)(StringRef TT); @@ -86,7 +87,8 @@ namespace llvm { StringRef CPU, StringRef Features, Reloc::Model RM, - CodeModel::Model CM); + CodeModel::Model CM, + CodeGenOpt::Level OL); typedef AsmPrinter *(*AsmPrinterCtorTy)(TargetMachine &TM, MCStreamer &Streamer); typedef MCAsmBackend *(*MCAsmBackendCtorTy)(const Target &T, StringRef TT); @@ -145,8 +147,8 @@ namespace llvm { /// registered. MCAsmInfoCtorFnTy MCAsmInfoCtorFn; - /// MCCodeGenInfoCtorFn - Constructor function for this target's MCCodeGenInfo, - /// if registered. + /// MCCodeGenInfoCtorFn - Constructor function for this target's + /// MCCodeGenInfo, if registered. MCCodeGenInfoCtorFnTy MCCodeGenInfoCtorFn; /// MCInstrInfoCtorFn - Constructor function for this target's MCInstrInfo, @@ -277,10 +279,11 @@ namespace llvm { /// createMCCodeGenInfo - Create a MCCodeGenInfo implementation. /// MCCodeGenInfo *createMCCodeGenInfo(StringRef Triple, Reloc::Model RM, - CodeModel::Model CM) const { + CodeModel::Model CM, + CodeGenOpt::Level OL) const { if (!MCCodeGenInfoCtorFn) return 0; - return MCCodeGenInfoCtorFn(Triple, RM, CM); + return MCCodeGenInfoCtorFn(Triple, RM, CM, OL); } /// createMCInstrInfo - Create a MCInstrInfo implementation. @@ -331,12 +334,13 @@ namespace llvm { /// either the target triple from the module, or the target triple of the /// host if that does not exist. TargetMachine *createTargetMachine(StringRef Triple, StringRef CPU, - StringRef Features, - Reloc::Model RM = Reloc::Default, - CodeModel::Model CM = CodeModel::Default) const { + StringRef Features, + Reloc::Model RM = Reloc::Default, + CodeModel::Model CM = CodeModel::Default, + CodeGenOpt::Level OL = CodeGenOpt::Default) const { if (!TargetMachineCtorFn) return 0; - return TargetMachineCtorFn(*this, Triple, CPU, Features, RM, CM); + return TargetMachineCtorFn(*this, Triple, CPU, Features, RM, CM, OL); } /// createMCAsmBackend - Create a target specific assembly parser. 
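The recurring theme in the TargetRegistry changes above: the CodeGenOpt::Level moves out of the per-call code-emission signatures and into target construction, where it is stored alongside the relocation and code models. A hedged sketch of the resulting client-side flow (error handling elided; the triple and CPU strings are illustrative):

    std::string Err;
    const Target *T =
        TargetRegistry::lookupTarget("x86_64-unknown-linux-gnu", Err);
    TargetMachine *TM =
        T->createTargetMachine("x86_64-unknown-linux-gnu", "generic", "",
                               Reloc::Default, CodeModel::Default,
                               CodeGenOpt::Aggressive); // the level is fixed here
    // Code-emission entry points no longer take an optimization level:
    //   TM->addPassesToEmitFile(PM, Out, TargetMachine::CGFT_AssemblyFile);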
@@ -843,8 +847,8 @@ namespace llvm { TargetRegistry::RegisterMCCodeGenInfo(T, &Allocator); } private: - static MCCodeGenInfo *Allocator(StringRef TT, - Reloc::Model RM, CodeModel::Model CM) { + static MCCodeGenInfo *Allocator(StringRef TT, Reloc::Model RM, + CodeModel::Model CM, CodeGenOpt::Level OL) { return new MCCodeGenInfoImpl(); } }; @@ -1014,8 +1018,9 @@ namespace llvm { static TargetMachine *Allocator(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, - CodeModel::Model CM) { - return new TargetMachineImpl(T, TT, CPU, FS, RM, CM); + CodeModel::Model CM, + CodeGenOpt::Level OL) { + return new TargetMachineImpl(T, TT, CPU, FS, RM, CM, OL); } }; diff --git a/include/llvm/Target/TargetLibraryInfo.h b/include/llvm/Target/TargetLibraryInfo.h index 02a1a3c..7770a11 100644 --- a/include/llvm/Target/TargetLibraryInfo.h +++ b/include/llvm/Target/TargetLibraryInfo.h @@ -11,6 +11,7 @@ #define LLVM_TARGET_TARGETLIBRARYINFO_H #include "llvm/Pass.h" +#include "llvm/ADT/DenseMap.h" namespace llvm { class Triple; @@ -37,7 +38,14 @@ namespace llvm { /// int fiprintf(FILE *stream, const char *format, ...); fiprintf, - + + // size_t fwrite(const void *ptr, size_t size, size_t nitems, + // FILE *stream); + fwrite, + + // int fputs(const char *s, FILE *stream); + fputs, + NumLibFuncs }; } @@ -46,7 +54,23 @@ namespace llvm { /// library functions are available for the current target, and allows a /// frontend to disable optimizations through -fno-builtin etc. class TargetLibraryInfo : public ImmutablePass { - unsigned char AvailableArray[(LibFunc::NumLibFuncs+7)/8]; + unsigned char AvailableArray[(LibFunc::NumLibFuncs+3)/4]; + llvm::DenseMap<unsigned, std::string> CustomNames; + static const char* StandardNames[LibFunc::NumLibFuncs]; + + enum AvailabilityState { + StandardName = 3, // (memset to all ones) + CustomName = 1, + Unavailable = 0 // (memset to all zeros) + }; + void setState(LibFunc::Func F, AvailabilityState State) { + AvailableArray[F/4] &= ~(3 << 2*(F&3)); + AvailableArray[F/4] |= State << 2*(F&3); + } + AvailabilityState getState(LibFunc::Func F) const { + return static_cast<AvailabilityState>((AvailableArray[F/4] >> 2*(F&3)) & 3); + } + public: static char ID; TargetLibraryInfo(); @@ -56,19 +80,39 @@ public: /// has - This function is used by optimizations that want to match on or form /// a given library function. bool has(LibFunc::Func F) const { - return (AvailableArray[F/8] & (1 << (F&7))) != 0; + return getState(F) != Unavailable; + } + + StringRef getName(LibFunc::Func F) const { + AvailabilityState State = getState(F); + if (State == Unavailable) + return StringRef(); + if (State == StandardName) + return StandardNames[F]; + assert(State == CustomName); + return CustomNames.find(F)->second; } /// setUnavailable - this can be used by whatever sets up TargetLibraryInfo to /// ban use of specific library functions. void setUnavailable(LibFunc::Func F) { - AvailableArray[F/8] &= ~(1 << (F&7)); + setState(F, Unavailable); } void setAvailable(LibFunc::Func F) { - AvailableArray[F/8] |= 1 << (F&7); + setState(F, StandardName); } - + + void setAvailableWithName(LibFunc::Func F, StringRef Name) { + if (StandardNames[F] != Name) { + setState(F, CustomName); + CustomNames[F] = Name; + assert(CustomNames.find(F) != CustomNames.end()); + } else { + setState(F, StandardName); + } + } + /// disableAllFunctions - This disables all builtins, which is used for /// options like -fno-builtin. 
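The TargetLibraryInfo rewrite above needs three states per library function (standard name, custom name, unavailable), so the old one-bit-per-function bitmap grows to two bits per function, four entries per byte; a memset to all ones still means "everything available under its standard name" because StandardName is the all-ones state. The same packing arithmetic as a standalone sketch (the ten-entry table is invented):

    #include <cstdio>
    #include <cstring>

    enum State { Unavailable = 0, CustomName = 1, StandardName = 3 };
    static unsigned char Array[(10 + 3) / 4]; // 10 functions, 2 bits apiece

    static void setState(unsigned F, State S) {
      Array[F / 4] &= ~(3 << 2 * (F & 3)); // clear the function's 2-bit slot
      Array[F / 4] |= S << 2 * (F & 3);    // then write the new state
    }
    static State getState(unsigned F) {
      return static_cast<State>((Array[F / 4] >> 2 * (F & 3)) & 3);
    }

    int main() {
      std::memset(Array, -1, sizeof(Array)); // all ones: all StandardName
      setState(7, CustomName);
      std::printf("%d %d\n", getState(7), getState(8)); // prints: 1 3
      return 0;
    }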
void disableAllFunctions(); diff --git a/include/llvm/Target/TargetMachine.h b/include/llvm/Target/TargetMachine.h index 366a13c..db42350 100644 --- a/include/llvm/Target/TargetMachine.h +++ b/include/llvm/Target/TargetMachine.h @@ -43,16 +43,6 @@ class TargetSubtargetInfo; class formatted_raw_ostream; class raw_ostream; -// Code generation optimization level. -namespace CodeGenOpt { - enum Level { - None, // -O0 - Less, // -O1 - Default, // -O2, -Os - Aggressive // -O3 - }; -} - namespace Sched { enum Preference { None, // No preference @@ -212,6 +202,10 @@ public: /// medium, large, and target default. CodeModel::Model getCodeModel() const; + /// getOptLevel - Returns the optimization level: None, Less, + /// Default, or Aggressive. + CodeGenOpt::Level getOptLevel() const; + /// getAsmVerbosityDefault - Returns the default value of asm verbosity. /// static bool getAsmVerbosityDefault(); @@ -255,7 +249,6 @@ public: virtual bool addPassesToEmitFile(PassManagerBase &, formatted_raw_ostream &, CodeGenFileType, - CodeGenOpt::Level, bool = true) { return true; } @@ -268,7 +261,6 @@ public: /// virtual bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &, - CodeGenOpt::Level, bool = true) { return true; } @@ -281,7 +273,6 @@ public: virtual bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &, - CodeGenOpt::Level, bool = true) { return true; } @@ -294,24 +285,23 @@ class LLVMTargetMachine : public TargetMachine { protected: // Can only create subclasses. LLVMTargetMachine(const Target &T, StringRef TargetTriple, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); private: /// addCommonCodeGenPasses - Add standard LLVM codegen passes used for /// both emitting to assembly files or machine code output. /// - bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, + bool addCommonCodeGenPasses(PassManagerBase &, bool DisableVerify, MCContext *&OutCtx); public: /// addPassesToEmitFile - Add passes to the specified pass manager to get the /// specified file emitted. Typically this will involve several steps of code - /// generation. If OptLevel is None, the code generator should emit code as - /// fast as possible, though the generated code may be less efficient. + /// generation. virtual bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level, bool DisableVerify = true); /// addPassesToEmitMachineCode - Add passes to the specified pass manager to @@ -322,7 +312,6 @@ public: /// virtual bool addPassesToEmitMachineCode(PassManagerBase &PM, JITCodeEmitter &MCE, - CodeGenOpt::Level, bool DisableVerify = true); /// addPassesToEmitMC - Add passes to the specified pass manager to get @@ -333,27 +322,26 @@ public: virtual bool addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, raw_ostream &OS, - CodeGenOpt::Level OptLevel, bool DisableVerify = true); /// Target-Independent Code Generator Pass Configuration Options. /// addPreISelPasses - This method should add any "last minute" LLVM->LLVM /// passes (which are run just before instruction selector). - virtual bool addPreISel(PassManagerBase &, CodeGenOpt::Level) { + virtual bool addPreISel(PassManagerBase &) { return true; } /// addInstSelector - This method should install an instruction selector pass, /// which converts from LLVM code to machine instructions. 
- virtual bool addInstSelector(PassManagerBase &, CodeGenOpt::Level) { + virtual bool addInstSelector(PassManagerBase &) { return true; } /// addPreRegAlloc - This method may be implemented by targets that want to /// run passes immediately before register allocation. This should return /// true if -print-machineinstrs should print after these passes. - virtual bool addPreRegAlloc(PassManagerBase &, CodeGenOpt::Level) { + virtual bool addPreRegAlloc(PassManagerBase &) { return false; } @@ -361,7 +349,7 @@ public: /// to run passes after register allocation but before prolog-epilog /// insertion. This should return true if -print-machineinstrs should print /// after these passes. - virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level) { + virtual bool addPostRegAlloc(PassManagerBase &) { return false; } @@ -369,14 +357,14 @@ public: /// run passes after prolog-epilog insertion and before the second instruction /// scheduling pass. This should return true if -print-machineinstrs should /// print after these passes. - virtual bool addPreSched2(PassManagerBase &, CodeGenOpt::Level) { + virtual bool addPreSched2(PassManagerBase &) { return false; } /// addPreEmitPass - This pass may be implemented by targets that want to run /// passes immediately before machine code is emitted. This should return /// true if -print-machineinstrs should print out the code after the passes. - virtual bool addPreEmitPass(PassManagerBase &, CodeGenOpt::Level) { + virtual bool addPreEmitPass(PassManagerBase &) { return false; } @@ -384,7 +372,7 @@ public: /// addCodeEmitter - This pass should be overridden by the target to add a /// code emitter, if supported. If this is not supported, 'true' should be /// returned. - virtual bool addCodeEmitter(PassManagerBase &, CodeGenOpt::Level, + virtual bool addCodeEmitter(PassManagerBase &, JITCodeEmitter &) { return true; } diff --git a/include/llvm/Transforms/Instrumentation.h b/include/llvm/Transforms/Instrumentation.h index 8d55231..f9b9623 100644 --- a/include/llvm/Transforms/Instrumentation.h +++ b/include/llvm/Transforms/Instrumentation.h @@ -31,6 +31,9 @@ ModulePass *createPathProfilerPass(); ModulePass *createGCOVProfilerPass(bool EmitNotes = true, bool EmitData = true, bool Use402Format = false); +// Insert AddressSanitizer (address sanity checking) instrumentation +ModulePass *createAddressSanitizerPass(); + } // End llvm namespace #endif diff --git a/include/llvm/Transforms/Utils/BuildLibCalls.h b/include/llvm/Transforms/Utils/BuildLibCalls.h index e825938..17cd58e 100644 --- a/include/llvm/Transforms/Utils/BuildLibCalls.h +++ b/include/llvm/Transforms/Utils/BuildLibCalls.h @@ -20,6 +20,7 @@ namespace llvm { class Value; class TargetData; + class TargetLibraryInfo; /// CastToCStr - Return V if it is an i8*, otherwise cast it to i8*. Value *CastToCStr(Value *V, IRBuilder<> &B); @@ -68,7 +69,7 @@ namespace llvm { /// 'Op' and returns one value with the same type. If 'Op' is a long double, /// 'l' is added as the suffix of name, if 'Op' is a float, we add a 'f' /// suffix. - Value *EmitUnaryFloatFnCall(Value *Op, const char *Name, IRBuilder<> &B, + Value *EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, const AttrListPtr &Attrs); /// EmitPutChar - Emit a call to the putchar function. This assumes that Char @@ -86,12 +87,13 @@ namespace llvm { /// EmitFPutS - Emit a call to the puts function. Str is required to be a /// pointer and File is a pointer to FILE. 
- void EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, const TargetData *TD); + void EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI); /// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. void EmitFWrite(Value *Ptr, Value *Size, Value *File, IRBuilder<> &B, - const TargetData *TD); + const TargetData *TD, const TargetLibraryInfo *TLI); /// SimplifyFortifiedLibCalls - Helper class for folding checked library /// calls (e.g. __strcpy_chk) into their unchecked counterparts. diff --git a/include/llvm/Value.h b/include/llvm/Value.h index a71e2fd..9b47143 100644 --- a/include/llvm/Value.h +++ b/include/llvm/Value.h @@ -117,19 +117,8 @@ public: /// getName() - Return a constant reference to the value's name. This is cheap /// and guaranteed to return the same reference as long as the value is not /// modified. - /// - /// This is currently guaranteed to return a StringRef for which data() points - /// to a valid null terminated string. The use of StringRef.data() is - /// deprecated here, however, and clients should not rely on it. If such - /// behavior is needed, clients should use expensive getNameStr(), or switch - /// to an interface that does not depend on null termination. StringRef getName() const; - /// getNameStr() - Return the name of the specified value, *constructing a - /// string* to hold it. This is guaranteed to construct a string and is very - /// expensive, clients should use getName() unless necessary. - std::string getNameStr() const; - /// setName() - Change the name of the value, choosing a new unique name if /// the provided name is taken. /// diff --git a/lib/Analysis/BasicAliasAnalysis.cpp b/lib/Analysis/BasicAliasAnalysis.cpp index af400ba..568983a 100644 --- a/lib/Analysis/BasicAliasAnalysis.cpp +++ b/lib/Analysis/BasicAliasAnalysis.cpp @@ -706,8 +706,7 @@ BasicAliasAnalysis::getModRefInfo(ImmutableCallSite CS, // pointer were passed to arguments that were neither of these, then it // couldn't be no-capture. if (!(*CI)->getType()->isPointerTy() || - (!CS.paramHasAttr(ArgNo+1, Attribute::NoCapture) && - !CS.paramHasAttr(ArgNo+1, Attribute::ByVal))) + (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo))) continue; // If this is a no-capture pointer argument, see if we can tell that it diff --git a/lib/Analysis/BranchProbabilityInfo.cpp b/lib/Analysis/BranchProbabilityInfo.cpp index 258fe54..f9461c0 100644 --- a/lib/Analysis/BranchProbabilityInfo.cpp +++ b/lib/Analysis/BranchProbabilityInfo.cpp @@ -480,8 +480,8 @@ getEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst) const { void BranchProbabilityInfo:: setEdgeWeight(const BasicBlock *Src, const BasicBlock *Dst, uint32_t Weight) { Weights[std::make_pair(Src, Dst)] = Weight; - DEBUG(dbgs() << "set edge " << Src->getNameStr() << " -> " - << Dst->getNameStr() << " weight to " << Weight + DEBUG(dbgs() << "set edge " << Src->getName() << " -> " + << Dst->getName() << " weight to " << Weight << (isEdgeHot(Src, Dst) ? " [is HOT now]\n" : "\n")); } @@ -501,7 +501,7 @@ BranchProbabilityInfo::printEdgeProbability(raw_ostream &OS, const BasicBlock *Dst) const { const BranchProbability Prob = getEdgeProbability(Src, Dst); - OS << "edge " << Src->getNameStr() << " -> " << Dst->getNameStr() + OS << "edge " << Src->getName() << " -> " << Dst->getName() << " probability is " << Prob << (isEdgeHot(Src, Dst) ? 
" [HOT edge]\n" : "\n"); diff --git a/lib/Analysis/CFGPrinter.cpp b/lib/Analysis/CFGPrinter.cpp index 7bb063f..7685400 100644 --- a/lib/Analysis/CFGPrinter.cpp +++ b/lib/Analysis/CFGPrinter.cpp @@ -77,7 +77,7 @@ namespace { } virtual bool runOnFunction(Function &F) { - std::string Filename = "cfg." + F.getNameStr() + ".dot"; + std::string Filename = "cfg." + F.getName().str() + ".dot"; errs() << "Writing '" << Filename << "'..."; std::string ErrorInfo; @@ -111,7 +111,7 @@ namespace { } virtual bool runOnFunction(Function &F) { - std::string Filename = "cfg." + F.getNameStr() + ".dot"; + std::string Filename = "cfg." + F.getName().str() + ".dot"; errs() << "Writing '" << Filename << "'..."; std::string ErrorInfo; @@ -143,7 +143,7 @@ INITIALIZE_PASS(CFGOnlyPrinter, "dot-cfg-only", /// being a 'dot' and 'gv' program in your path. /// void Function::viewCFG() const { - ViewGraph(this, "cfg" + getNameStr()); + ViewGraph(this, "cfg" + getName()); } /// viewCFGOnly - This function is meant for use from the debugger. It works @@ -152,7 +152,7 @@ void Function::viewCFG() const { /// his can make the graph smaller. /// void Function::viewCFGOnly() const { - ViewGraph(this, "cfg" + getNameStr(), true); + ViewGraph(this, "cfg" + getName(), true); } FunctionPass *llvm::createCFGPrinterPass () { diff --git a/lib/Analysis/CaptureTracking.cpp b/lib/Analysis/CaptureTracking.cpp index a84dafb..9a7992e 100644 --- a/lib/Analysis/CaptureTracking.cpp +++ b/lib/Analysis/CaptureTracking.cpp @@ -19,8 +19,10 @@ #include "llvm/Analysis/CaptureTracking.h" using namespace llvm; +CaptureTracker::~CaptureTracker() {} + namespace { - struct SimpleCaptureTracker { + struct SimpleCaptureTracker : public CaptureTracker { explicit SimpleCaptureTracker(bool ReturnCaptures) : ReturnCaptures(ReturnCaptures), Captured(false) {} @@ -51,6 +53,9 @@ namespace { /// counts as capturing it or not. bool llvm::PointerMayBeCaptured(const Value *V, bool ReturnCaptures, bool StoreCaptures) { + assert(!isa<GlobalValue>(V) && + "It doesn't make sense to ask whether a global is captured."); + // TODO: If StoreCaptures is not true, we could do Fancy analysis // to determine whether this store is not actually an escape point. // In that case, BasicAliasAnalysis should be updated as well to @@ -58,6 +63,111 @@ bool llvm::PointerMayBeCaptured(const Value *V, (void)StoreCaptures; SimpleCaptureTracker SCT(ReturnCaptures); - PointerMayBeCaptured(V, SCT); + PointerMayBeCaptured(V, &SCT); return SCT.Captured; } + +/// TODO: Write a new FunctionPass AliasAnalysis so that it can keep +/// a cache. Then we can move the code from BasicAliasAnalysis into +/// that path, and remove this threshold. +static int const Threshold = 20; + +void llvm::PointerMayBeCaptured(const Value *V, CaptureTracker *Tracker) { + assert(V->getType()->isPointerTy() && "Capture is for pointers only!"); + SmallVector<Use*, Threshold> Worklist; + SmallSet<Use*, Threshold> Visited; + int Count = 0; + + for (Value::const_use_iterator UI = V->use_begin(), UE = V->use_end(); + UI != UE; ++UI) { + // If there are lots of uses, conservatively say that the value + // is captured to avoid taking too much compile time. 
+ if (Count++ >= Threshold) + return Tracker->tooManyUses(); + + Use *U = &UI.getUse(); + if (!Tracker->shouldExplore(U)) continue; + Visited.insert(U); + Worklist.push_back(U); + } + + while (!Worklist.empty()) { + Use *U = Worklist.pop_back_val(); + Instruction *I = cast<Instruction>(U->getUser()); + V = U->get(); + + switch (I->getOpcode()) { + case Instruction::Call: + case Instruction::Invoke: { + CallSite CS(I); + // Not captured if the callee is readonly, doesn't return a copy through + // its return value and doesn't unwind (a readonly function can leak bits + // by throwing an exception or not depending on the input value). + if (CS.onlyReadsMemory() && CS.doesNotThrow() && I->getType()->isVoidTy()) + break; + + // Not captured if only passed via 'nocapture' arguments. Note that + // calling a function pointer does not in itself cause the pointer to + // be captured. This is a subtle point considering that (for example) + // the callee might return its own address. It is analogous to saying + // that loading a value from a pointer does not cause the pointer to be + // captured, even though the loaded value might be the pointer itself + // (think of self-referential objects). + CallSite::arg_iterator B = CS.arg_begin(), E = CS.arg_end(); + for (CallSite::arg_iterator A = B; A != E; ++A) + if (A->get() == V && !CS.doesNotCapture(A - B)) + // The parameter is not marked 'nocapture' - captured. + if (Tracker->captured(I)) + return; + break; + } + case Instruction::Load: + // Loading from a pointer does not cause it to be captured. + break; + case Instruction::VAArg: + // "va-arg" from a pointer does not cause it to be captured. + break; + case Instruction::Store: + if (V == I->getOperand(0)) + // Stored the pointer - conservatively assume it may be captured. + if (Tracker->captured(I)) + return; + // Storing to the pointee does not cause the pointer to be captured. + break; + case Instruction::BitCast: + case Instruction::GetElementPtr: + case Instruction::PHI: + case Instruction::Select: + // The original value is not captured via this if the new value isn't. + for (Instruction::use_iterator UI = I->use_begin(), UE = I->use_end(); + UI != UE; ++UI) { + Use *U = &UI.getUse(); + if (Visited.insert(U)) + if (Tracker->shouldExplore(U)) + Worklist.push_back(U); + } + break; + case Instruction::ICmp: + // Don't count comparisons of a no-alias return value against null as + // captures. This allows us to ignore comparisons of malloc results + // with null, for example. + if (isNoAliasCall(V->stripPointerCasts())) + if (ConstantPointerNull *CPN = + dyn_cast<ConstantPointerNull>(I->getOperand(1))) + if (CPN->getType()->getAddressSpace() == 0) + break; + // Otherwise, be conservative. There are crazy ways to capture pointers + // using comparisons. + if (Tracker->captured(I)) + return; + break; + default: + // Something else - be conservative and say it is captured. + if (Tracker->captured(I)) + return; + break; + } + } + + // All uses examined. +} diff --git a/lib/Analysis/LoopInfo.cpp b/lib/Analysis/LoopInfo.cpp index 85aacca..c7833bf 100644 --- a/lib/Analysis/LoopInfo.cpp +++ b/lib/Analysis/LoopInfo.cpp @@ -477,21 +477,19 @@ void UnloopUpdater::updateBlockParents() { /// removeBlocksFromAncestors - Remove unloop's blocks from all ancestors below /// their new parents. void UnloopUpdater::removeBlocksFromAncestors() { - // Remove unloop's blocks from all ancestors below their new parents. 
+ // Remove all unloop's blocks (including those in nested subloops) from + // ancestors below the new parent loop. for (Loop::block_iterator BI = Unloop->block_begin(), BE = Unloop->block_end(); BI != BE; ++BI) { - Loop *NewParent = LI->getLoopFor(*BI); - // If this block is an immediate subloop, remove all blocks (including - // nested subloops) from ancestors below the new parent loop. - // Otherwise, if this block is in a nested subloop, skip it. - if (SubloopParents.count(NewParent)) - NewParent = SubloopParents[NewParent]; - else if (Unloop->contains(NewParent)) - continue; - + Loop *OuterParent = LI->getLoopFor(*BI); + if (Unloop->contains(OuterParent)) { + while (OuterParent->getParentLoop() != Unloop) + OuterParent = OuterParent->getParentLoop(); + OuterParent = SubloopParents[OuterParent]; + } // Remove blocks from former Ancestors except Unloop itself which will be // deleted. - for (Loop *OldParent = Unloop->getParentLoop(); OldParent != NewParent; + for (Loop *OldParent = Unloop->getParentLoop(); OldParent != OuterParent; OldParent = OldParent->getParentLoop()) { assert(OldParent && "new loop is not an ancestor of the original"); OldParent->removeBlockFromLoop(*BI); diff --git a/lib/Analysis/MemoryBuiltins.cpp b/lib/Analysis/MemoryBuiltins.cpp index 8d451c4..b145650 100644 --- a/lib/Analysis/MemoryBuiltins.cpp +++ b/lib/Analysis/MemoryBuiltins.cpp @@ -48,10 +48,10 @@ static bool isMallocCall(const CallInst *CI) { // FIXME: workaround for PR5130, this will be obsolete when a nobuiltin // attribute will exist. FunctionType *FTy = Callee->getFunctionType(); - if (FTy->getNumParams() != 1) - return false; - return FTy->getParamType(0)->isIntegerTy(32) || - FTy->getParamType(0)->isIntegerTy(64); + return FTy->getReturnType() == Type::getInt8PtrTy(FTy->getContext()) && + FTy->getNumParams() == 1 && + (FTy->getParamType(0)->isIntegerTy(32) || + FTy->getParamType(0)->isIntegerTy(64)); } /// extractMallocCall - Returns the corresponding CallInst if the instruction diff --git a/lib/Analysis/MemoryDependenceAnalysis.cpp b/lib/Analysis/MemoryDependenceAnalysis.cpp index 323c84f..704e27b 100644 --- a/lib/Analysis/MemoryDependenceAnalysis.cpp +++ b/lib/Analysis/MemoryDependenceAnalysis.cpp @@ -336,7 +336,7 @@ getLoadLoadClobberFullWidthSize(const Value *MemLocBase, int64_t MemLocOffs, namespace { /// Only find pointer captures which happen before the given instruction. Uses /// the dominator tree to determine whether one instruction is before another. - struct CapturesBefore { + struct CapturesBefore : public CaptureTracker { CapturesBefore(const Instruction *I, DominatorTree *DT) : BeforeHere(I), DT(DT), Captured(false) {} @@ -375,13 +375,13 @@ MemoryDependenceAnalysis::getModRefInfo(const Instruction *Inst, // with a smarter AA in place, this test is just wasting compile time. 
if (!DT) return AliasAnalysis::ModRef; const Value *Object = GetUnderlyingObject(MemLoc.Ptr, TD); - if (!isIdentifiedObject(Object) || isa<GlobalVariable>(Object)) + if (!isIdentifiedObject(Object) || isa<GlobalValue>(Object)) return AliasAnalysis::ModRef; ImmutableCallSite CS(Inst); if (!CS.getInstruction()) return AliasAnalysis::ModRef; CapturesBefore CB(Inst, DT); - llvm::PointerMayBeCaptured(Object, CB); + llvm::PointerMayBeCaptured(Object, &CB); if (isa<Constant>(Object) || CS.getInstruction() == Object || CB.Captured) return AliasAnalysis::ModRef; @@ -393,8 +393,7 @@ MemoryDependenceAnalysis::getModRefInfo(const Instruction *Inst, // pointer were passed to arguments that were neither of these, then it // couldn't be no-capture. if (!(*CI)->getType()->isPointerTy() || - (!CS.paramHasAttr(ArgNo+1, Attribute::NoCapture) && - !CS.paramHasAttr(ArgNo+1, Attribute::ByVal))) + (!CS.doesNotCapture(ArgNo) && !CS.isByValArgument(ArgNo))) continue; // If this is a no-capture pointer argument, see if we can tell that it diff --git a/lib/Analysis/PathProfileVerifier.cpp b/lib/Analysis/PathProfileVerifier.cpp index 0ae734e..0fcdfe7 100644 --- a/lib/Analysis/PathProfileVerifier.cpp +++ b/lib/Analysis/PathProfileVerifier.cpp @@ -137,22 +137,22 @@ bool PathProfileVerifier::runOnModule (Module &M) { BasicBlock* source = nextEdge->getSource(); BasicBlock* target = nextEdge->getTarget(); unsigned duplicateNumber = nextEdge->getDuplicateNumber(); - DEBUG(dbgs () << source->getNameStr() << " --{" << duplicateNumber - << "}--> " << target->getNameStr()); + DEBUG(dbgs() << source->getName() << " --{" << duplicateNumber + << "}--> " << target->getName()); // Ensure all the referenced edges exist // TODO: make this a separate function if( !arrayMap.count(source) ) { - errs() << " error [" << F->getNameStr() << "()]: source '" - << source->getNameStr() + errs() << " error [" << F->getName() << "()]: source '" + << source->getName() << "' does not exist in the array map.\n"; } else if( !arrayMap[source].count(target) ) { - errs() << " error [" << F->getNameStr() << "()]: target '" - << target->getNameStr() + errs() << " error [" << F->getName() << "()]: target '" + << target->getName() << "' does not exist in the array map.\n"; } else if( !arrayMap[source][target].count(duplicateNumber) ) { - errs() << " error [" << F->getNameStr() << "()]: edge " - << source->getNameStr() << " -> " << target->getNameStr() + errs() << " error [" << F->getName() << "()]: edge " + << source->getName() << " -> " << target->getName() << " duplicate number " << duplicateNumber << " does not exist in the array map.\n"; } else { diff --git a/lib/Analysis/ProfileEstimatorPass.cpp b/lib/Analysis/ProfileEstimatorPass.cpp index b594e2b..63468f8 100644 --- a/lib/Analysis/ProfileEstimatorPass.cpp +++ b/lib/Analysis/ProfileEstimatorPass.cpp @@ -332,7 +332,7 @@ bool ProfileEstimatorPass::runOnFunction(Function &F) { // Clear Minimal Edges. MinimalWeight.clear(); - DEBUG(dbgs() << "Working on function " << F.getNameStr() << "\n"); + DEBUG(dbgs() << "Working on function " << F.getName() << "\n"); // Since the entry block is the first one and has no predecessors, the edge // (0,entry) is inserted with the starting weight of 1. 
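The CaptureTracking and MemoryDependenceAnalysis hunks earlier share one pattern: capture analysis is now parameterized over a CaptureTracker subclass passed by pointer, instead of a fixed policy. A hedged sketch of a custom tracker; the overridden members mirror the calls PointerMayBeCaptured makes (tooManyUses, shouldExplore, captured), but the exact base-class signatures are assumed here, and the counting policy is invented:

    struct CountingTracker : public CaptureTracker {
      unsigned NumCaptures;
      CountingTracker() : NumCaptures(0) {}

      void tooManyUses() { NumCaptures = ~0u; }   // give up conservatively
      bool shouldExplore(Use *U) { return true; } // visit every use
      bool captured(Instruction *I) {
        ++NumCaptures;
        return false; // keep walking so every capturing use is counted
      }
    };
    // Usage: CountingTracker CT; PointerMayBeCaptured(Ptr, &CT);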
diff --git a/lib/Analysis/ProfileInfoLoaderPass.cpp b/lib/Analysis/ProfileInfoLoaderPass.cpp index 098079b..c4da807 100644 --- a/lib/Analysis/ProfileInfoLoaderPass.cpp +++ b/lib/Analysis/ProfileInfoLoaderPass.cpp @@ -160,7 +160,7 @@ bool LoaderPass::runOnModule(Module &M) { ReadCount = 0; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { if (F->isDeclaration()) continue; - DEBUG(dbgs()<<"Working on "<<F->getNameStr()<<"\n"); + DEBUG(dbgs() << "Working on " << F->getName() << "\n"); readEdge(getEdge(0,&F->getEntryBlock()), Counters); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { TerminatorInst *TI = BB->getTerminator(); @@ -181,7 +181,7 @@ bool LoaderPass::runOnModule(Module &M) { ReadCount = 0; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { if (F->isDeclaration()) continue; - DEBUG(dbgs()<<"Working on "<<F->getNameStr()<<"\n"); + DEBUG(dbgs() << "Working on " << F->getName() << "\n"); readEdge(getEdge(0,&F->getEntryBlock()), Counters); for (Function::iterator BB = F->begin(), E = F->end(); BB != E; ++BB) { TerminatorInst *TI = BB->getTerminator(); diff --git a/lib/Analysis/ProfileVerifierPass.cpp b/lib/Analysis/ProfileVerifierPass.cpp index a017518..379d79c 100644 --- a/lib/Analysis/ProfileVerifierPass.cpp +++ b/lib/Analysis/ProfileVerifierPass.cpp @@ -125,8 +125,8 @@ namespace llvm { outCount++; } } - dbgs() << "Block " << BB->getNameStr() << " in " - << BB->getParent()->getNameStr() << ":" + dbgs() << "Block " << BB->getName() << " in " + << BB->getParent()->getName() << ":" << "BBWeight=" << format("%20.20g",BBWeight) << "," << "inWeight=" << format("%20.20g",inWeight) << "," << "inCount=" << inCount << "," @@ -143,8 +143,8 @@ namespace llvm { template<class FType, class BType> void ProfileVerifierPassT<FType, BType>::debugEntry (DetailedBlockInfo *DI) { - dbgs() << "TROUBLE: Block " << DI->BB->getNameStr() << " in " - << DI->BB->getParent()->getNameStr() << ":" + dbgs() << "TROUBLE: Block " << DI->BB->getName() << " in " + << DI->BB->getParent()->getName() << ":" << "BBWeight=" << format("%20.20g",DI->BBWeight) << "," << "inWeight=" << format("%20.20g",DI->inWeight) << "," << "inCount=" << DI->inCount << "," @@ -201,13 +201,13 @@ namespace llvm { double EdgeWeight = PI->getEdgeWeight(E); if (EdgeWeight == ProfileInfoT<FType, BType>::MissingValue) { dbgs() << "Edge " << E << " in Function " - << ProfileInfoT<FType, BType>::getFunction(E)->getNameStr() << ": "; + << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": "; ASSERTMESSAGE("Edge has missing value"); return 0; } else { if (EdgeWeight < 0) { dbgs() << "Edge " << E << " in Function " - << ProfileInfoT<FType, BType>::getFunction(E)->getNameStr() << ": "; + << ProfileInfoT<FType, BType>::getFunction(E)->getName() << ": "; ASSERTMESSAGE("Edge has negative value"); } return EdgeWeight; @@ -220,8 +220,8 @@ namespace llvm { DetailedBlockInfo *DI) { if (Error) { DEBUG(debugEntry(DI)); - dbgs() << "Block " << DI->BB->getNameStr() << " in Function " - << DI->BB->getParent()->getNameStr() << ": "; + dbgs() << "Block " << DI->BB->getName() << " in Function " + << DI->BB->getParent()->getName() << ": "; ASSERTMESSAGE(Message); } return; diff --git a/lib/Analysis/RegionInfo.cpp b/lib/Analysis/RegionInfo.cpp index 52753cb..828913d 100644 --- a/lib/Analysis/RegionInfo.cpp +++ b/lib/Analysis/RegionInfo.cpp @@ -186,18 +186,16 @@ std::string Region::getNameStr() const { raw_string_ostream OS(entryName); WriteAsOperand(OS, getEntry(), false); - entryName = OS.str(); } else - 
entryName = getEntry()->getNameStr(); + entryName = getEntry()->getName(); if (getExit()) { if (getExit()->getName().empty()) { raw_string_ostream OS(exitName); WriteAsOperand(OS, getExit(), false); - exitName = OS.str(); } else - exitName = getExit()->getNameStr(); + exitName = getExit()->getName(); } else exitName = "<Function Return>"; diff --git a/lib/Analysis/ScalarEvolution.cpp b/lib/Analysis/ScalarEvolution.cpp index ac00259..622b214 100644 --- a/lib/Analysis/ScalarEvolution.cpp +++ b/lib/Analysis/ScalarEvolution.cpp @@ -3601,9 +3601,9 @@ const SCEV *ScalarEvolution::createSCEV(Value *V) { SCEV::NoWrapFlags Flags = SCEV::FlagAnyWrap; OverflowingBinaryOperator *OBO = cast<OverflowingBinaryOperator>(V); if (OBO->hasNoSignedWrap()) - setFlags(Flags, SCEV::FlagNSW); + Flags = setFlags(Flags, SCEV::FlagNSW); if (OBO->hasNoUnsignedWrap()) - setFlags(Flags, SCEV::FlagNUW); + Flags = setFlags(Flags, SCEV::FlagNUW); return getAddExpr(AddOps, Flags); } case Instruction::Mul: { @@ -4153,13 +4153,19 @@ void ScalarEvolution::forgetValue(Value *V) { } /// getExact - Get the exact loop backedge taken count considering all loop -/// exits. If all exits are computable, this is the minimum computed count. +/// exits. A computable result can only be returned for loops with a single exit. +/// Returning the minimum taken count among all exits is incorrect because one +/// of the loop's exit limits may have been skipped. HowFarToZero assumes that +/// the limit of each loop test is never skipped. This is a valid assumption as +/// long as the loop exits via that test. For precise results, it is the +/// caller's responsibility to specify the relevant loop exit using +/// getExact(ExitingBlock, SE). const SCEV * ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const { // If any exits were not computable, the loop is not computable. if (!ExitNotTaken.isCompleteList()) return SE->getCouldNotCompute(); - // We need at least one computable exit. + // We need exactly one computable exit. if (!ExitNotTaken.ExitingBlock) return SE->getCouldNotCompute(); assert(ExitNotTaken.ExactNotTaken && "uninitialized not-taken info"); @@ -4171,8 +4177,8 @@ ScalarEvolution::BackedgeTakenInfo::getExact(ScalarEvolution *SE) const { if (!BECount) BECount = ENT->ExactNotTaken; - else - BECount = SE->getUMinFromMismatchedTypes(BECount, ENT->ExactNotTaken); + else if (BECount != ENT->ExactNotTaken) + return SE->getCouldNotCompute(); } assert(BECount && "Invalid not taken count for loop exit"); return BECount; @@ -4253,8 +4259,15 @@ ScalarEvolution::ComputeBackedgeTakenCount(const Loop *L) { if (MaxBECount == getCouldNotCompute()) MaxBECount = EL.Max; - else if (EL.Max != getCouldNotCompute()) - MaxBECount = getUMinFromMismatchedTypes(MaxBECount, EL.Max); + else if (EL.Max != getCouldNotCompute()) { + // We cannot take the "min" MaxBECount, because non-unit stride loops may + // skip some loop tests. Taking the max over the exits is sufficiently + // conservative. TODO: We could do better taking into consideration + // that (1) the loop has unit stride (2) the last loop test is + // less-than/greater-than (3) any loop test is less-than/greater-than AND + // falls-through some constant times less than the other tests.
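Note the small but important fix at the top of the ScalarEvolution hunk above: setFlags is a pure helper returning the combined mask, not a mutator, so the unassigned calls silently dropped the just-proven NSW/NUW bits. The bug pattern in isolation (enum and helper simplified from the real definitions):

    #include <cstdio>

    enum NoWrapFlags { FlagAnyWrap = 0, FlagNUW = 1, FlagNSW = 2 };
    NoWrapFlags setFlags(NoWrapFlags F, NoWrapFlags On) {
      return static_cast<NoWrapFlags>(F | On); // returns; does not mutate F
    }

    int main() {
      NoWrapFlags Flags = FlagAnyWrap;
      setFlags(Flags, FlagNSW);         // old bug: result discarded
      std::printf("%d\n", Flags);       // still 0
      Flags = setFlags(Flags, FlagNSW); // the fix: assign the result
      std::printf("%d\n", Flags);       // now 2
      return 0;
    }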
+ MaxBECount = getUMaxFromMismatchedTypes(MaxBECount, EL.Max); + } } return BackedgeTakenInfo(ExitCounts, CouldComputeBECount, MaxBECount); @@ -4920,7 +4933,7 @@ const SCEV *ScalarEvolution::ComputeExitCountExhaustively(const Loop *L, // the loop symbolically to determine when the condition gets a value of // "ExitWhen". - unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis. + unsigned MaxIterations = MaxBruteForceIterations; // Limit analysis. for (unsigned IterationNum = 0; IterationNum != MaxIterations;++IterationNum){ ConstantInt *CondVal = dyn_cast_or_null<ConstantInt>(EvaluateExpression(Cond, L, @@ -5507,10 +5520,10 @@ ScalarEvolution::HowFarToZero(const SCEV *V, const Loop *L) { // behavior. Loops must exhibit defined behavior until a wrapped value is // actually used. So the trip count computed by udiv could be smaller than the // number of well-defined iterations. - if (AddRec->getNoWrapFlags(SCEV::FlagNW)) + if (AddRec->getNoWrapFlags(SCEV::FlagNW)) { // FIXME: We really want an "isexact" bit for udiv. return getUDivExpr(Distance, CountDown ? getNegativeSCEV(Step) : Step); - + } // Then, try to solve the above equation provided that Start is constant. if (const SCEVConstant *StartC = dyn_cast<SCEVConstant>(Start)) return SolveLinEquationWithOverflow(StepC->getValue()->getValue(), diff --git a/lib/Analysis/SparsePropagation.cpp b/lib/Analysis/SparsePropagation.cpp index d8c207b..035bcea 100644 --- a/lib/Analysis/SparsePropagation.cpp +++ b/lib/Analysis/SparsePropagation.cpp @@ -327,13 +327,13 @@ void SparseSolver::Solve(Function &F) { } void SparseSolver::Print(Function &F, raw_ostream &OS) const { - OS << "\nFUNCTION: " << F.getNameStr() << "\n"; + OS << "\nFUNCTION: " << F.getName() << "\n"; for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { if (!BBExecutable.count(BB)) OS << "INFEASIBLE: "; OS << "\t"; if (BB->hasName()) - OS << BB->getNameStr() << ":\n"; + OS << BB->getName() << ":\n"; else OS << "; anon bb\n"; for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { diff --git a/lib/Analysis/Trace.cpp b/lib/Analysis/Trace.cpp index 68a39cd..ff5010b 100644 --- a/lib/Analysis/Trace.cpp +++ b/lib/Analysis/Trace.cpp @@ -34,7 +34,7 @@ Module *Trace::getModule() const { /// void Trace::print(raw_ostream &O) const { Function *F = getFunction(); - O << "; Trace from function " << F->getNameStr() << ", blocks:\n"; + O << "; Trace from function " << F->getName() << ", blocks:\n"; for (const_iterator i = begin(), e = end(); i != e; ++i) { O << "; "; WriteAsOperand(O, *i, true, getModule()); diff --git a/lib/Analysis/ValueTracking.cpp b/lib/Analysis/ValueTracking.cpp index 9f7b5b5..22f1c14 100644 --- a/lib/Analysis/ValueTracking.cpp +++ b/lib/Analysis/ValueTracking.cpp @@ -248,9 +248,14 @@ void llvm::ComputeMaskedBits(Value *V, const APInt &Mask, APInt::getHighBitsSet(BitWidth, LeadZ); KnownZero &= Mask; - if (isKnownNonNegative) + // Only make use of no-wrap flags if we failed to compute the sign bit + // directly. This matters if the multiplication always overflows, in + // which case we prefer to follow the result of the direct computation, + // though as the program is invoking undefined behaviour we can choose + // whatever we like here. 
+ if (isKnownNonNegative && !KnownOne.isNegative()) KnownZero.setBit(BitWidth - 1); - else if (isKnownNegative) + else if (isKnownNegative && !KnownZero.isNegative()) KnownOne.setBit(BitWidth - 1); return; diff --git a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp index a08b61c..711b796 100644 --- a/lib/CodeGen/AsmPrinter/AsmPrinter.cpp +++ b/lib/CodeGen/AsmPrinter/AsmPrinter.cpp @@ -749,6 +749,18 @@ void AsmPrinter::EmitFunctionBody() { OutStreamer.EmitRawText(StringRef("\tnop\n")); } + const Function *F = MF->getFunction(); + for (Function::const_iterator i = F->begin(), e = F->end(); i != e; ++i) { + const BasicBlock *BB = i; + if (!BB->hasAddressTaken()) + continue; + MCSymbol *Sym = GetBlockAddressSymbol(BB); + if (Sym->isDefined()) + continue; + OutStreamer.AddComment("Address of block that was removed by CodeGen"); + OutStreamer.EmitLabel(Sym); + } + // Emit target-specific gunk after the function body. EmitFunctionBodyEnd(); diff --git a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp index a3a2488..6c77a63 100644 --- a/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfAccelTable.cpp @@ -60,7 +60,7 @@ void DwarfAccelTable::ComputeBucketCount(void) { uniques.resize(Data.size()); for (size_t i = 0, e = Data.size(); i < e; ++i) uniques[i] = Data[i]->HashValue; - std::sort(uniques.begin(), uniques.end()); + std::stable_sort(uniques.begin(), uniques.end()); std::vector<uint32_t>::iterator p = std::unique(uniques.begin(), uniques.end()); uint32_t num = std::distance(uniques.begin(), p); @@ -73,6 +73,15 @@ void DwarfAccelTable::ComputeBucketCount(void) { Header.hashes_count = num; } +namespace { + // DIESorter - comparison predicate that sorts DIEs by their offset. + struct DIESorter { + bool operator()(DIE *A, DIE *B) const { + return A->getOffset() < B->getOffset(); + } + }; +} + void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, const char *Prefix) { // Create the individual hash data outputs. for (StringMap<DIEArray>::iterator @@ -80,7 +89,7 @@ void DwarfAccelTable::FinalizeTable(AsmPrinter *Asm, const char *Prefix) { struct HashData *Entry = new HashData((*EI).getKeyData()); // Unique the entries. - std::sort((*EI).second.begin(), (*EI).second.end()); + std::stable_sort((*EI).second.begin(), (*EI).second.end(), DIESorter()); (*EI).second.erase(std::unique((*EI).second.begin(), (*EI).second.end()), (*EI).second.end()); diff --git a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp index 159c096..237998a 100644 --- a/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp +++ b/lib/CodeGen/AsmPrinter/DwarfCompileUnit.cpp @@ -1154,8 +1154,8 @@ void CompileUnit::createGlobalVariableDIE(const MDNode *N) { void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy){ DIE *DW_Subrange = new DIE(dwarf::DW_TAG_subrange_type); addDIEEntry(DW_Subrange, dwarf::DW_AT_type, dwarf::DW_FORM_ref4, IndexTy); - int64_t L = SR.getLo(); - int64_t H = SR.getHi(); + uint64_t L = SR.getLo(); + uint64_t H = SR.getHi(); // The L value defines the lower bounds which is typically zero for C/C++. The // H value is the upper bounds. Values are 64 bit. 
H - L + 1 is the size @@ -1168,8 +1168,8 @@ void CompileUnit::constructSubrangeDIE(DIE &Buffer, DISubrange SR, DIE *IndexTy) return; } if (L) - addSInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L); - addSInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H); + addUInt(DW_Subrange, dwarf::DW_AT_lower_bound, 0, L); + addUInt(DW_Subrange, dwarf::DW_AT_upper_bound, 0, H); Buffer.addChild(DW_Subrange); } diff --git a/lib/CodeGen/EdgeBundles.cpp b/lib/CodeGen/EdgeBundles.cpp index a7aba89..3bb0465 100644 --- a/lib/CodeGen/EdgeBundles.cpp +++ b/lib/CodeGen/EdgeBundles.cpp @@ -77,7 +77,7 @@ void EdgeBundles::view() const { /// Specialize WriteGraph, the standard implementation won't work. raw_ostream &llvm::WriteGraph(raw_ostream &O, const EdgeBundles &G, bool ShortNames, - const std::string &Title) { + const Twine &Title) { const MachineFunction *MF = G.getMachineFunction(); O << "digraph {\n"; diff --git a/lib/CodeGen/ExecutionDepsFix.cpp b/lib/CodeGen/ExecutionDepsFix.cpp index 050edce..300f037 100644 --- a/lib/CodeGen/ExecutionDepsFix.cpp +++ b/lib/CodeGen/ExecutionDepsFix.cpp @@ -600,6 +600,9 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { while (!Regs.empty()) { if (!dv) { dv = Regs.pop_back_val().Value; + // Force the first dv to match the current instruction. + dv->AvailableDomains = dv->getCommonDomains(available); + assert(dv->AvailableDomains && "Domain should have been filtered"); continue; } @@ -617,9 +620,10 @@ void ExeDepsFix::visitSoftInstr(MachineInstr *mi, unsigned mask) { } // dv is the DomainValue we are going to use for this instruction. - if (!dv) + if (!dv) { dv = alloc(); - dv->AvailableDomains = available; + dv->AvailableDomains = available; + } dv->Instrs.push_back(mi); // Finally set all defs and non-collapsed uses to dv. @@ -650,10 +654,11 @@ bool ExeDepsFix::runOnMachineFunction(MachineFunction &mf) { bool anyregs = false; for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end(); I != E; ++I) - if (MF->getRegInfo().isPhysRegUsed(*I)) { - anyregs = true; - break; - } + for (const unsigned *AI = TRI->getOverlaps(*I); *AI; ++AI) + if (MF->getRegInfo().isPhysRegUsed(*AI)) { + anyregs = true; + break; + } if (!anyregs) return false; // Initialize the AliasMap on the first use. 
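The ExecutionDepsFix fix just above widens the "does this function touch the register class at all" test: a register may be flagged as used only under an aliasing name (a sub- or super-register), so each class member's overlap list has to be scanned rather than the member alone. The loop in isolation, with one optional tightening; getOverlaps returns a null-terminated list that includes the register itself:

    bool anyregs = false;
    for (TargetRegisterClass::const_iterator I = RC->begin(), E = RC->end();
         I != E && !anyregs; ++I) // checking !anyregs avoids a full rescan
      for (const unsigned *AI = TRI->getOverlaps(*I); *AI; ++AI)
        if (MF->getRegInfo().isPhysRegUsed(*AI)) {
          anyregs = true;
          break;
        }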
diff --git a/lib/CodeGen/GCMetadata.cpp b/lib/CodeGen/GCMetadata.cpp index d757cf4..0281d05 100644 --- a/lib/CodeGen/GCMetadata.cpp +++ b/lib/CodeGen/GCMetadata.cpp @@ -156,12 +156,12 @@ bool Printer::runOnFunction(Function &F) { GCFunctionInfo *FD = &getAnalysis<GCModuleInfo>().getFunctionInfo(F); - OS << "GC roots for " << FD->getFunction().getNameStr() << ":\n"; + OS << "GC roots for " << FD->getFunction().getName() << ":\n"; for (GCFunctionInfo::roots_iterator RI = FD->roots_begin(), RE = FD->roots_end(); RI != RE; ++RI) OS << "\t" << RI->Num << "\t" << RI->StackOffset << "[sp]\n"; - OS << "GC safe points for " << FD->getFunction().getNameStr() << ":\n"; + OS << "GC safe points for " << FD->getFunction().getName() << ":\n"; for (GCFunctionInfo::iterator PI = FD->begin(), PE = FD->end(); PI != PE; ++PI) { diff --git a/lib/CodeGen/LLVMTargetMachine.cpp b/lib/CodeGen/LLVMTargetMachine.cpp index 3e69069..03b5693 100644 --- a/lib/CodeGen/LLVMTargetMachine.cpp +++ b/lib/CodeGen/LLVMTargetMachine.cpp @@ -114,9 +114,10 @@ EnableFastISelOption("fast-isel", cl::Hidden, LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : TargetMachine(T, Triple, CPU, FS) { - CodeGenInfo = T.createMCCodeGenInfo(Triple, RM, CM); + CodeGenInfo = T.createMCCodeGenInfo(Triple, RM, CM, OL); AsmInfo = T.createMCAsmInfo(Triple); // TargetSelect.h moved to a different directory between LLVM 2.9 and 3.0, // and if the old one gets included then MCAsmInfo will be NULL and @@ -130,11 +131,10 @@ LLVMTargetMachine::LLVMTargetMachine(const Target &T, StringRef Triple, bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, bool DisableVerify) { // Add common CodeGen passes. MCContext *Context = 0; - if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Context)) + if (addCommonCodeGenPasses(PM, DisableVerify, Context)) return true; assert(Context != 0 && "Failed to get MCContext"); @@ -219,14 +219,13 @@ bool LLVMTargetMachine::addPassesToEmitFile(PassManagerBase &PM, /// bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, JITCodeEmitter &JCE, - CodeGenOpt::Level OptLevel, bool DisableVerify) { // Add common CodeGen passes. MCContext *Ctx = 0; - if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Ctx)) + if (addCommonCodeGenPasses(PM, DisableVerify, Ctx)) return true; - addCodeEmitter(PM, OptLevel, JCE); + addCodeEmitter(PM, JCE); PM.add(createGCInfoDeleter()); return false; // success! @@ -240,10 +239,9 @@ bool LLVMTargetMachine::addPassesToEmitMachineCode(PassManagerBase &PM, bool LLVMTargetMachine::addPassesToEmitMC(PassManagerBase &PM, MCContext *&Ctx, raw_ostream &Out, - CodeGenOpt::Level OptLevel, bool DisableVerify) { // Add common CodeGen passes. - if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Ctx)) + if (addCommonCodeGenPasses(PM, DisableVerify, Ctx)) return true; if (hasMCSaveTempLabels()) @@ -295,7 +293,6 @@ static void printAndVerify(PassManagerBase &PM, /// emitting to assembly files or machine code output. /// bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, bool DisableVerify, MCContext *&OutContext) { // Standard LLVM-Level Passes. 
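The LLVMTargetMachine hunks above and the addCommonCodeGenPasses hunks below show the payoff of storing the level at construction time: target hooks lose their CodeGenOpt::Level parameter and query the machine via getOptLevel() instead. A hedged sketch of an override after the change; HypotheticalTargetMachine and createHypotheticalISelDag are invented names:

    bool HypotheticalTargetMachine::addInstSelector(PassManagerBase &PM) {
      // The level fixed when the TargetMachine was created is always at hand.
      PM.add(createHypotheticalISelDag(*this, getOptLevel()));
      return false; // false means an instruction selector was installed
    }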
@@ -313,7 +310,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, PM.add(createVerifierPass()); // Run loop strength reduction before anything else. - if (OptLevel != CodeGenOpt::None && !DisableLSR) { + if (getOptLevel() != CodeGenOpt::None && !DisableLSR) { PM.add(createLoopStrengthReducePass(getTargetLowering())); if (PrintLSR) PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs())); @@ -349,12 +346,12 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, break; } - if (OptLevel != CodeGenOpt::None && !DisableCGP) + if (getOptLevel() != CodeGenOpt::None && !DisableCGP) PM.add(createCodeGenPreparePass(getTargetLowering())); PM.add(createStackProtectorPass(getTargetLowering())); - addPreISel(PM, OptLevel); + addPreISel(PM); if (PrintISelInput) PM.add(createPrintFunctionPass("\n\n" @@ -377,15 +374,16 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, OutContext = &MMI->getContext(); // Return the MCContext specifically by-ref. // Set up a MachineFunction for the rest of CodeGen to work on. - PM.add(new MachineFunctionAnalysis(*this, OptLevel)); + PM.add(new MachineFunctionAnalysis(*this)); // Enable FastISel with -fast, but allow that to be overridden. if (EnableFastISelOption == cl::BOU_TRUE || - (OptLevel == CodeGenOpt::None && EnableFastISelOption != cl::BOU_FALSE)) + (getOptLevel() == CodeGenOpt::None && + EnableFastISelOption != cl::BOU_FALSE)) EnableFastISel = true; // Ask the target for an isel. - if (addInstSelector(PM, OptLevel)) + if (addInstSelector(PM)) return true; // Print the instruction selected machine code... @@ -395,21 +393,21 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, PM.add(createExpandISelPseudosPass()); // Pre-ra tail duplication. - if (OptLevel != CodeGenOpt::None && !DisableEarlyTailDup) { + if (getOptLevel() != CodeGenOpt::None && !DisableEarlyTailDup) { PM.add(createTailDuplicatePass(true)); printAndVerify(PM, "After Pre-RegAlloc TailDuplicate"); } // Optimize PHIs before DCE: removing dead PHI cycles may make more // instructions dead. - if (OptLevel != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) PM.add(createOptimizePHIsPass()); // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. PM.add(createLocalStackSlotAllocationPass()); - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { // With optimization, dead code should already be eliminated. However // there is one known exception: lowered code for arguments that are only // used by tail calls, where the tail calls reuse the incoming stack @@ -431,15 +429,15 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, } // Run pre-ra passes. - if (addPreRegAlloc(PM, OptLevel)) + if (addPreRegAlloc(PM)) printAndVerify(PM, "After PreRegAlloc passes"); // Perform register allocation. - PM.add(createRegisterAllocator(OptLevel)); + PM.add(createRegisterAllocator(getOptLevel())); printAndVerify(PM, "After Register Allocation"); // Perform stack slot coloring and post-ra machine LICM. - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { // FIXME: Re-enable coloring with register when it's capable of adding // kill markers. if (!DisableSSC) @@ -453,7 +451,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, } // Run post-ra passes. 
- if (addPostRegAlloc(PM, OptLevel)) + if (addPostRegAlloc(PM)) printAndVerify(PM, "After PostRegAlloc passes"); PM.add(createExpandPostRAPseudosPass()); @@ -464,23 +462,23 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, printAndVerify(PM, "After PrologEpilogCodeInserter"); // Run pre-sched2 passes. - if (addPreSched2(PM, OptLevel)) + if (addPreSched2(PM)) printAndVerify(PM, "After PreSched2 passes"); // Second pass scheduler. - if (OptLevel != CodeGenOpt::None && !DisablePostRA) { - PM.add(createPostRAScheduler(OptLevel)); + if (getOptLevel() != CodeGenOpt::None && !DisablePostRA) { + PM.add(createPostRAScheduler(getOptLevel())); printAndVerify(PM, "After PostRAScheduler"); } // Branch folding must be run after regalloc and prolog/epilog insertion. - if (OptLevel != CodeGenOpt::None && !DisableBranchFold) { + if (getOptLevel() != CodeGenOpt::None && !DisableBranchFold) { PM.add(createBranchFoldingPass(getEnableTailMergeDefault())); printNoVerify(PM, "After BranchFolding"); } // Tail duplication. - if (OptLevel != CodeGenOpt::None && !DisableTailDuplicate) { + if (getOptLevel() != CodeGenOpt::None && !DisableTailDuplicate) { PM.add(createTailDuplicatePass(false)); printNoVerify(PM, "After TailDuplicate"); } @@ -490,7 +488,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, if (PrintGCInfo) PM.add(createGCInfoPrinter(dbgs())); - if (OptLevel != CodeGenOpt::None && !DisableCodePlace) { + if (getOptLevel() != CodeGenOpt::None && !DisableCodePlace) { if (EnableBlockPlacement) { // MachineBlockPlacement is an experimental pass which is disabled by // default currently. Eventually it should subsume CodePlacementOpt, so @@ -509,7 +507,7 @@ bool LLVMTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, } } - if (addPreEmitPass(PM, OptLevel)) + if (addPreEmitPass(PM)) printNoVerify(PM, "After PreEmit passes"); return false; diff --git a/lib/CodeGen/MachineBasicBlock.cpp b/lib/CodeGen/MachineBasicBlock.cpp index 4c5fe4c..b9d1ef7 100644 --- a/lib/CodeGen/MachineBasicBlock.cpp +++ b/lib/CodeGen/MachineBasicBlock.cpp @@ -297,8 +297,22 @@ void MachineBasicBlock::updateTerminator() { TII->RemoveBranch(*this); } else { // The block has an unconditional fallthrough. If its successor is not - // its layout successor, insert a branch. - TBB = *succ_begin(); + // its layout successor, insert a branch. First we have to locate the + // only non-landing-pad successor, as that is the fallthrough block. + for (succ_iterator SI = succ_begin(), SE = succ_end(); SI != SE; ++SI) { + if ((*SI)->isLandingPad()) + continue; + assert(!TBB && "Found more than one non-landing-pad successor!"); + TBB = *SI; + } + + // If there is no non-landing-pad successor, the block has no + // fall-through edges to be concerned with. + if (!TBB) + return; + + // Finally update the unconditional successor to be reached via a branch + // if it would not be reached by fallthrough. 
if (!isLayoutSuccessor(TBB)) TII->InsertBranch(*this, TBB, 0, Cond, dl); } diff --git a/lib/CodeGen/MachineBlockPlacement.cpp b/lib/CodeGen/MachineBlockPlacement.cpp index 304f167..55d804b 100644 --- a/lib/CodeGen/MachineBlockPlacement.cpp +++ b/lib/CodeGen/MachineBlockPlacement.cpp @@ -214,11 +214,12 @@ class MachineBlockPlacement : public MachineFunctionPass { MachineBasicBlock *selectBestCandidateBlock( BlockChain &Chain, SmallVectorImpl<MachineBasicBlock *> &WorkList, const BlockFilterSet *BlockFilter); - MachineBasicBlock *getFirstUnplacedBlock(const BlockChain &PlacedChain, - ArrayRef<MachineBasicBlock *> Blocks, - unsigned &PrevUnplacedBlockIdx); + MachineBasicBlock *getFirstUnplacedBlock( + MachineFunction &F, + const BlockChain &PlacedChain, + MachineFunction::iterator &PrevUnplacedBlockIt, + const BlockFilterSet *BlockFilter); void buildChain(MachineBasicBlock *BB, BlockChain &Chain, - ArrayRef<MachineBasicBlock *> Blocks, SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, const BlockFilterSet *BlockFilter = 0); void buildLoopChains(MachineFunction &F, MachineLoop &L); @@ -314,7 +315,7 @@ void MachineBlockPlacement::markChainSuccessors( // This is a cross-chain edge that is within the loop, so decrement the // loop predecessor count of the destination chain. if (SuccChain.LoopPredecessors > 0 && --SuccChain.LoopPredecessors == 0) - BlockWorkList.push_back(*SI); + BlockWorkList.push_back(*SuccChain.begin()); } } } @@ -354,15 +355,45 @@ MachineBasicBlock *MachineBlockPlacement::selectBestSuccessor( DEBUG(dbgs() << " " << getBlockName(*SI) << " -> Already merged!\n"); continue; } + if (*SI != *SuccChain.begin()) { + DEBUG(dbgs() << " " << getBlockName(*SI) << " -> Mid chain!\n"); + continue; + } uint32_t SuccWeight = MBPI->getEdgeWeight(BB, *SI); BranchProbability SuccProb(SuccWeight / WeightScale, SumWeight); // Only consider successors which are either "hot", or wouldn't violate // any CFG constraints. - if (SuccChain.LoopPredecessors != 0 && SuccProb < HotProb) { - DEBUG(dbgs() << " " << getBlockName(*SI) << " -> CFG conflict\n"); - continue; + if (SuccChain.LoopPredecessors != 0) { + if (SuccProb < HotProb) { + DEBUG(dbgs() << " " << getBlockName(*SI) << " -> CFG conflict\n"); + continue; + } + + // Make sure that a hot successor doesn't have a globally more important + // predecessor. + BlockFrequency CandidateEdgeFreq + = MBFI->getBlockFreq(BB) * SuccProb * HotProb.getCompl(); + bool BadCFGConflict = false; + for (MachineBasicBlock::pred_iterator PI = (*SI)->pred_begin(), + PE = (*SI)->pred_end(); + PI != PE; ++PI) { + if (*PI == *SI || (BlockFilter && !BlockFilter->count(*PI)) || + BlockToChain[*PI] == &Chain) + continue; + BlockFrequency PredEdgeFreq + = MBFI->getBlockFreq(*PI) * MBPI->getEdgeProbability(*PI, *SI); + if (PredEdgeFreq >= CandidateEdgeFreq) { + BadCFGConflict = true; + break; + } + } + if (BadCFGConflict) { + DEBUG(dbgs() << " " << getBlockName(*SI) + << " -> non-cold CFG conflict\n"); + continue; + } } DEBUG(dbgs() << " " << getBlockName(*SI) << " -> " << SuccProb @@ -444,18 +475,23 @@ MachineBasicBlock *MachineBlockPlacement::selectBestCandidateBlock( /// /// This routine is called when we are unable to use the CFG to walk through /// all of the basic blocks and form a chain due to unnatural loops in the CFG. -/// We walk through the sequence of blocks, starting from the -/// LastUnplacedBlockIdx. We update this index to avoid re-scanning the entire -/// sequence on repeated calls to this routine. 
+/// We walk through the function's blocks in order, starting from the +/// LastUnplacedBlockIt. We update this iterator on each call to avoid +/// re-scanning the entire sequence on repeated calls to this routine. MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( - const BlockChain &PlacedChain, - ArrayRef<MachineBasicBlock *> Blocks, - unsigned &PrevUnplacedBlockIdx) { - for (unsigned i = PrevUnplacedBlockIdx, e = Blocks.size(); i != e; ++i) { - MachineBasicBlock *BB = Blocks[i]; - if (BlockToChain[BB] != &PlacedChain) { - PrevUnplacedBlockIdx = i; - return BB; + MachineFunction &F, const BlockChain &PlacedChain, + MachineFunction::iterator &PrevUnplacedBlockIt, + const BlockFilterSet *BlockFilter) { + for (MachineFunction::iterator I = PrevUnplacedBlockIt, E = F.end(); I != E; + ++I) { + if (BlockFilter && !BlockFilter->count(I)) + continue; + if (BlockToChain[I] != &PlacedChain) { + PrevUnplacedBlockIt = I; + // Now select the head of the chain to which the unplaced block belongs + // as the block to place. This will force the entire chain to be placed, + // and satisfies the requirements of merging chains. + return *BlockToChain[I]->begin(); } } return 0; @@ -464,14 +500,12 @@ MachineBasicBlock *MachineBlockPlacement::getFirstUnplacedBlock( void MachineBlockPlacement::buildChain( MachineBasicBlock *BB, BlockChain &Chain, - ArrayRef<MachineBasicBlock *> Blocks, SmallVectorImpl<MachineBasicBlock *> &BlockWorkList, const BlockFilterSet *BlockFilter) { assert(BB); assert(BlockToChain[BB] == &Chain); - assert(*Chain.begin() == BB); - SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. - unsigned PrevUnplacedBlockIdx = 0; + MachineFunction &F = *BB->getParent(); + MachineFunction::iterator PrevUnplacedBlockIt = F.begin(); MachineBasicBlock *LoopHeaderBB = BB; markChainSuccessors(Chain, LoopHeaderBB, BlockWorkList, BlockFilter); @@ -482,26 +516,9 @@ void MachineBlockPlacement::buildChain( assert(*llvm::prior(Chain.end()) == BB); MachineBasicBlock *BestSucc = 0; - // Check for unreasonable branches, and forcibly merge the existing layout - // successor for them. We can handle cases that AnalyzeBranch can't: jump - // tables etc are fine. The case we want to handle specially is when there - // is potential fallthrough, but the branch cannot be analyzed. This - // includes blocks without terminators as well as other cases. - Cond.clear(); - MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch. - if (TII->AnalyzeBranch(*BB, TBB, FBB, Cond) && BB->canFallThrough()) { - MachineFunction::iterator I(BB), NextI(llvm::next(I)); - // Ensure that the layout successor is a viable block, as we know that - // fallthrough is a possibility. Note that this may not be a valid block - // in the loop, but we allow that to cope with degenerate situations. - assert(NextI != BB->getParent()->end()); - BestSucc = NextI; - } - - // Otherwise, look for the best viable successor if there is one to place - // immediately after this block. - if (!BestSucc) - BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); + // Look for the best viable successor if there is one to place immediately + // after this block. 
+ BestSucc = selectBestSuccessor(BB, Chain, BlockFilter); // If an immediate successor isn't available, look for the best viable // block among those we've identified as not violating the loop's CFG at @@ -510,7 +527,8 @@ void MachineBlockPlacement::buildChain( BestSucc = selectBestCandidateBlock(Chain, BlockWorkList, BlockFilter); if (!BestSucc) { - BestSucc = getFirstUnplacedBlock(Chain, Blocks, PrevUnplacedBlockIdx); + BestSucc = getFirstUnplacedBlock(F, Chain, PrevUnplacedBlockIt, + BlockFilter); if (!BestSucc) break; @@ -576,11 +594,10 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, } if (Chain.LoopPredecessors == 0) - BlockWorkList.push_back(*BI); + BlockWorkList.push_back(*Chain.begin()); } - buildChain(*L.block_begin(), LoopChain, L.getBlocks(), BlockWorkList, - &LoopBlockSet); + buildChain(*L.block_begin(), LoopChain, BlockWorkList, &LoopBlockSet); DEBUG({ // Crash at the end so we get all of the debugging output first. @@ -596,8 +613,7 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, if (!LoopBlockSet.erase(*BCI)) { // We don't mark the loop as bad here because there are real situations // where this can occur. For example, with an unanalyzable fallthrough - // from a loop block to a non-loop block. - // FIXME: Such constructs shouldn't exist. Track them down and fix them. + // from a loop block to a non-loop block or vice versa. dbgs() << "Loop chain contains a block not contained by the loop!\n" << " Loop header: " << getBlockName(*L.block_begin()) << "\n" << " Chain header: " << getBlockName(*LoopChain.begin()) << "\n" @@ -621,26 +637,43 @@ void MachineBlockPlacement::buildLoopChains(MachineFunction &F, void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Ensure that every BB in the function has an associated chain to simplify // the assumptions of the remaining algorithm. - for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) - BlockToChain[&*FI] = - new (ChainAllocator.Allocate()) BlockChain(BlockToChain, &*FI); + SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. + for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { + MachineBasicBlock *BB = FI; + BlockChain *Chain + = new (ChainAllocator.Allocate()) BlockChain(BlockToChain, BB); + // Also, merge any blocks which we cannot reason about and must preserve + // the exact fallthrough behavior for. + for (;;) { + Cond.clear(); + MachineBasicBlock *TBB = 0, *FBB = 0; // For AnalyzeBranch. + if (!TII->AnalyzeBranch(*BB, TBB, FBB, Cond) || !FI->canFallThrough()) + break; + + MachineFunction::iterator NextFI(llvm::next(FI)); + MachineBasicBlock *NextBB = NextFI; + // Ensure that the layout successor is a viable block, as we know that + // fallthrough is a possibility. + assert(NextFI != FE && "Can't fallthrough past the last block."); + DEBUG(dbgs() << "Pre-merging due to unanalyzable fallthrough: " + << getBlockName(BB) << " -> " << getBlockName(NextBB) + << "\n"); + Chain->merge(NextBB, 0); + FI = NextFI; + BB = NextBB; + } + } // Build any loop-based chains. for (MachineLoopInfo::iterator LI = MLI->begin(), LE = MLI->end(); LI != LE; ++LI) buildLoopChains(F, **LI); - // We need a vector of blocks so that buildChain can handle unnatural CFG - // constructs by searching for unplaced blocks and just concatenating them. 
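// ---- [editor's sketch, not part of the patch] ----------------------------
// The repeated "push_back(*SI)" -> "push_back(*Chain.begin())" changes in
// MachineBlockPlacement above all enforce one invariant: work lists hold the
// *head* block of a chain, because chains are only grown and merged at their
// head. A hypothetical, simplified illustration (BlockChain2 is a stand-in,
// not the real BlockChain type):
#include <vector>
struct BlockChain2 {
  std::vector<int*> Blocks;  // Blocks.front() is the chain head
};
static void enqueueReadyChain(BlockChain2 &C, std::vector<int*> &WorkList) {
  // Push the head, not whichever member block happened to become ready;
  // later placement walks the whole chain starting from its head.
  WorkList.push_back(C.Blocks.front());
}
// ---- [end editor's sketch] ------------------------------------------------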
- SmallVector<MachineBasicBlock *, 16> Blocks; - Blocks.reserve(F.size()); - SmallVector<MachineBasicBlock *, 16> BlockWorkList; SmallPtrSet<BlockChain *, 4> UpdatedPreds; for (MachineFunction::iterator FI = F.begin(), FE = F.end(); FI != FE; ++FI) { MachineBasicBlock *BB = &*FI; - Blocks.push_back(BB); BlockChain &Chain = *BlockToChain[BB]; if (!UpdatedPreds.insert(&Chain)) continue; @@ -659,11 +692,11 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { } if (Chain.LoopPredecessors == 0) - BlockWorkList.push_back(BB); + BlockWorkList.push_back(*Chain.begin()); } BlockChain &FunctionChain = *BlockToChain[&F.front()]; - buildChain(&F.front(), FunctionChain, Blocks, BlockWorkList); + buildChain(&F.front(), FunctionChain, BlockWorkList); typedef SmallPtrSet<MachineBasicBlock *, 16> FunctionBlockSetType; DEBUG({ @@ -695,7 +728,6 @@ void MachineBlockPlacement::buildCFGChains(MachineFunction &F) { // Splice the blocks into place. MachineFunction::iterator InsertPos = F.begin(); - SmallVector<MachineOperand, 4> Cond; // For AnalyzeBranch. for (BlockChain::iterator BI = FunctionChain.begin(), BE = FunctionChain.end(); BI != BE; ++BI) { diff --git a/lib/CodeGen/MachineFunction.cpp b/lib/CodeGen/MachineFunction.cpp index 20066a0..0c89a57 100644 --- a/lib/CodeGen/MachineFunction.cpp +++ b/lib/CodeGen/MachineFunction.cpp @@ -335,7 +335,7 @@ namespace llvm { DOTGraphTraits (bool isSimple=false) : DefaultDOTGraphTraits(isSimple) {} static std::string getGraphName(const MachineFunction *F) { - return "CFG for '" + F->getFunction()->getNameStr() + "' function"; + return "CFG for '" + F->getFunction()->getName().str() + "' function"; } std::string getNodeLabel(const MachineBasicBlock *Node, @@ -368,7 +368,7 @@ namespace llvm { void MachineFunction::viewCFG() const { #ifndef NDEBUG - ViewGraph(this, "mf" + getFunction()->getNameStr()); + ViewGraph(this, "mf" + getFunction()->getName()); #else errs() << "MachineFunction::viewCFG is only available in debug builds on " << "systems with Graphviz or gv!\n"; @@ -378,7 +378,7 @@ void MachineFunction::viewCFG() const void MachineFunction::viewCFGOnly() const { #ifndef NDEBUG - ViewGraph(this, "mf" + getFunction()->getNameStr(), true); + ViewGraph(this, "mf" + getFunction()->getName(), true); #else errs() << "MachineFunction::viewCFGOnly is only available in debug builds on " << "systems with Graphviz or gv!\n"; diff --git a/lib/CodeGen/MachineFunctionAnalysis.cpp b/lib/CodeGen/MachineFunctionAnalysis.cpp index 054c750..35591e1 100644 --- a/lib/CodeGen/MachineFunctionAnalysis.cpp +++ b/lib/CodeGen/MachineFunctionAnalysis.cpp @@ -19,9 +19,8 @@ using namespace llvm; char MachineFunctionAnalysis::ID = 0; -MachineFunctionAnalysis::MachineFunctionAnalysis(const TargetMachine &tm, - CodeGenOpt::Level OL) : - FunctionPass(ID), TM(tm), OptLevel(OL), MF(0) { +MachineFunctionAnalysis::MachineFunctionAnalysis(const TargetMachine &tm) : + FunctionPass(ID), TM(tm), MF(0) { initializeMachineModuleInfoPass(*PassRegistry::getPassRegistry()); } diff --git a/lib/CodeGen/MachineLICM.cpp b/lib/CodeGen/MachineLICM.cpp index e756ded..e5e8c51 100644 --- a/lib/CodeGen/MachineLICM.cpp +++ b/lib/CodeGen/MachineLICM.cpp @@ -670,7 +670,7 @@ MachineLICM::getRegisterClassIDAndCost(const MachineInstr *MI, unsigned &RCId, unsigned &RCCost) const { const TargetRegisterClass *RC = MRI->getRegClass(Reg); EVT VT = *RC->vt_begin(); - if (VT == MVT::untyped) { + if (VT == MVT::Untyped) { RCId = RC->getID(); RCCost = 1; } else { diff --git a/lib/CodeGen/MachineVerifier.cpp 
b/lib/CodeGen/MachineVerifier.cpp index b3c28b0..f231e3c 100644 --- a/lib/CodeGen/MachineVerifier.cpp +++ b/lib/CodeGen/MachineVerifier.cpp @@ -320,7 +320,7 @@ void MachineVerifier::report(const char *msg, const MachineFunction *MF) { MF->print(*OS, Indexes); } *OS << "*** Bad machine code: " << msg << " ***\n" - << "- function: " << MF->getFunction()->getNameStr() << "\n"; + << "- function: " << MF->getFunction()->getName() << "\n"; } void MachineVerifier::report(const char *msg, const MachineBasicBlock *MBB) { diff --git a/lib/CodeGen/PostRASchedulerList.cpp b/lib/CodeGen/PostRASchedulerList.cpp index c73e877..7205ed6 100644 --- a/lib/CodeGen/PostRASchedulerList.cpp +++ b/lib/CodeGen/PostRASchedulerList.cpp @@ -248,8 +248,8 @@ bool PostRAScheduler::runOnMachineFunction(MachineFunction &Fn) { static int bbcnt = 0; if (bbcnt++ % DebugDiv != DebugMod) continue; - dbgs() << "*** DEBUG scheduling " << Fn.getFunction()->getNameStr() << - ":BB#" << MBB->getNumber() << " ***\n"; + dbgs() << "*** DEBUG scheduling " << Fn.getFunction()->getName() + << ":BB#" << MBB->getNumber() << " ***\n"; } #endif diff --git a/lib/CodeGen/RegAllocFast.cpp b/lib/CodeGen/RegAllocFast.cpp index b36a445..4664a3c 100644 --- a/lib/CodeGen/RegAllocFast.cpp +++ b/lib/CodeGen/RegAllocFast.cpp @@ -682,7 +682,7 @@ void RAFast::handleThroughOperands(MachineInstr *MI, } SmallVector<unsigned, 8> PartialDefs; - DEBUG(dbgs() << "Allocating tied uses and early clobbers.\n"); + DEBUG(dbgs() << "Allocating tied uses.\n"); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { MachineOperand &MO = MI->getOperand(i); if (!MO.isReg()) continue; @@ -704,15 +704,24 @@ void RAFast::handleThroughOperands(MachineInstr *MI, // That would confuse the later phys-def processing pass. LiveRegMap::iterator LRI = reloadVirtReg(MI, i, Reg, 0); PartialDefs.push_back(LRI->second.PhysReg); - } else if (MO.isEarlyClobber()) { - // Note: defineVirtReg may invalidate MO. - LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0); - unsigned PhysReg = LRI->second.PhysReg; - if (setPhysReg(MI, i, PhysReg)) - VirtDead.push_back(Reg); } } + DEBUG(dbgs() << "Allocating early clobbers.\n"); + for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { + MachineOperand &MO = MI->getOperand(i); + if (!MO.isReg()) continue; + unsigned Reg = MO.getReg(); + if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; + if (!MO.isEarlyClobber()) + continue; + // Note: defineVirtReg may invalidate MO. + LiveRegMap::iterator LRI = defineVirtReg(MI, i, Reg, 0); + unsigned PhysReg = LRI->second.PhysReg; + if (setPhysReg(MI, i, PhysReg)) + VirtDead.push_back(Reg); + } + // Restore UsedInInstr to a state usable for allocating normal virtual uses. UsedInInstr.reset(); for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { @@ -815,7 +824,6 @@ void RAFast::AllocateBasicBlock() { if (!MO.isReg()) continue; unsigned Reg = MO.getReg(); if (!TargetRegisterInfo::isVirtualRegister(Reg)) continue; - LiveDbgValueMap[Reg].push_back(MI); LiveRegMap::iterator LRI = LiveVirtRegs.find(Reg); if (LRI != LiveVirtRegs.end()) setPhysReg(MI, i, LRI->second.PhysReg); @@ -849,6 +857,7 @@ void RAFast::AllocateBasicBlock() { } } } + LiveDbgValueMap[Reg].push_back(MI); } } // Next instruction. diff --git a/lib/CodeGen/ScheduleDAGPrinter.cpp b/lib/CodeGen/ScheduleDAGPrinter.cpp index 4b55a22..6304243 100644 --- a/lib/CodeGen/ScheduleDAGPrinter.cpp +++ b/lib/CodeGen/ScheduleDAGPrinter.cpp @@ -86,12 +86,12 @@ void ScheduleDAG::viewGraph() { // This code is only for debugging! 
#ifndef NDEBUG if (BB->getBasicBlock()) - ViewGraph(this, "dag." + MF.getFunction()->getNameStr(), false, - "Scheduling-Units Graph for " + MF.getFunction()->getNameStr() + - ":" + BB->getBasicBlock()->getNameStr()); + ViewGraph(this, "dag." + MF.getFunction()->getName(), false, + "Scheduling-Units Graph for " + MF.getFunction()->getName() + + ":" + BB->getBasicBlock()->getName()); else - ViewGraph(this, "dag." + MF.getFunction()->getNameStr(), false, - "Scheduling-Units Graph for " + MF.getFunction()->getNameStr()); + ViewGraph(this, "dag." + MF.getFunction()->getName(), false, + "Scheduling-Units Graph for " + MF.getFunction()->getName()); #else errs() << "ScheduleDAG::viewGraph is only available in debug builds on " << "systems with Graphviz or gv!\n"; diff --git a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 4384db8..d8208a4 100644 --- a/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22,7 +22,6 @@ #include "llvm/LLVMContext.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineFrameInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" @@ -6937,10 +6936,23 @@ SDValue DAGCombiner::visitEXTRACT_VECTOR_ELT(SDNode *N) { DAG.getConstant(PtrOff, PtrType)); } - return DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr, - LN0->getPointerInfo().getWithOffset(PtrOff), - LN0->isVolatile(), LN0->isNonTemporal(), - LN0->isInvariant(), Align); + // The replacement we need to do here is a little tricky: we need to + // replace an extractelement of a load with a load. + // Use ReplaceAllUsesOfValuesWith to do the replacement. + SDValue Load = DAG.getLoad(LVT, N->getDebugLoc(), LN0->getChain(), NewPtr, + LN0->getPointerInfo().getWithOffset(PtrOff), + LN0->isVolatile(), LN0->isNonTemporal(), + LN0->isInvariant(), Align); + WorkListRemover DeadNodes(*this); + SDValue From[] = { SDValue(N, 0), SDValue(LN0, 1) }; + SDValue To[] = { Load.getValue(0), Load.getValue(1) }; + DAG.ReplaceAllUsesOfValuesWith(From, To, 2, &DeadNodes); + // Since we're explicitly calling ReplaceAllUses, add the new node to the + // worklist explicitly as well. + AddToWorkList(Load.getNode()); + // Make sure to revisit this node to clean it up; it will usually be dead. + AddToWorkList(N); + return SDValue(N, 0); } return SDValue(); diff --git a/lib/CodeGen/SelectionDAG/FastISel.cpp b/lib/CodeGen/SelectionDAG/FastISel.cpp index e8f8c73..cff37c2 100644 --- a/lib/CodeGen/SelectionDAG/FastISel.cpp +++ b/lib/CodeGen/SelectionDAG/FastISel.cpp @@ -39,6 +39,7 @@ // //===----------------------------------------------------------------------===// +#define DEBUG_TYPE "isel" #include "llvm/Function.h" #include "llvm/GlobalVariable.h" #include "llvm/Instructions.h" @@ -58,8 +59,12 @@ #include "llvm/Target/TargetMachine.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/Debug.h" +#include "llvm/ADT/Statistic.h" using namespace llvm; +STATISTIC(NumFastIselSuccessIndependent, "Number of insts selected by target-independent selector"); +STATISTIC(NumFastIselSuccessTarget, "Number of insts selected by target-specific selector"); + /// startNewBlock - Set the current block to which generated machine /// instructions will be appended, and clear the local CSE map.
/// @@ -96,6 +101,11 @@ bool FastISel::hasTrivialKill(const Value *V) const { !hasTrivialKill(Cast->getOperand(0))) return false; + // GEPs with all zero indices are trivially coalesced by fast-isel. + if (const GetElementPtrInst *GEP = dyn_cast<GetElementPtrInst>(I)) + if (GEP->hasAllZeroIndices() && !hasTrivialKill(GEP->getOperand(0))) + return false; + // Only instructions with a single use in the same basic block are considered // to have trivial kills. return I->hasOneUse() && @@ -427,6 +437,11 @@ bool FastISel::SelectGetElementPtr(const User *I) { bool NIsKill = hasTrivialKill(I->getOperand(0)); + // Keep a running tab of the total offset to coalesce multiple N = N + Offset + // into a single N = N + TotalOffset. + uint64_t TotalOffs = 0; + // FIXME: What's a good SWAG number for MaxOffs? + uint64_t MaxOffs = 2048; Type *Ty = I->getOperand(0)->getType(); MVT VT = TLI.getPointerTy(); for (GetElementPtrInst::const_op_iterator OI = I->op_begin()+1, @@ -436,14 +451,15 @@ bool FastISel::SelectGetElementPtr(const User *I) { unsigned Field = cast<ConstantInt>(Idx)->getZExtValue(); if (Field) { // N = N + Offset - uint64_t Offs = TD.getStructLayout(StTy)->getElementOffset(Field); - // FIXME: This can be optimized by combining the add with a - // subsequent one. - N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, Offs, VT); - if (N == 0) - // Unhandled operand. Halt "fast" selection and bail. - return false; - NIsKill = true; + TotalOffs += TD.getStructLayout(StTy)->getElementOffset(Field); + if (TotalOffs >= MaxOffs) { + N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); + if (N == 0) + // Unhandled operand. Halt "fast" selection and bail. + return false; + NIsKill = true; + TotalOffs = 0; + } } Ty = StTy->getElementType(Field); } else { @@ -452,14 +468,26 @@ bool FastISel::SelectGetElementPtr(const User *I) { // If this is a constant subscript, handle it quickly. if (const ConstantInt *CI = dyn_cast<ConstantInt>(Idx)) { if (CI->isZero()) continue; - uint64_t Offs = + // N = N + Offset + TotalOffs += TD.getTypeAllocSize(Ty)*cast<ConstantInt>(CI)->getSExtValue(); - N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, Offs, VT); + if (TotalOffs >= MaxOffs) { + N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); + if (N == 0) + // Unhandled operand. Halt "fast" selection and bail. + return false; + NIsKill = true; + TotalOffs = 0; + } + continue; + } + if (TotalOffs) { + N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); if (N == 0) // Unhandled operand. Halt "fast" selection and bail. return false; NIsKill = true; - continue; + TotalOffs = 0; } // N = N + Idx * ElementSize; @@ -484,6 +512,12 @@ bool FastISel::SelectGetElementPtr(const User *I) { return false; } } + if (TotalOffs) { + N = FastEmit_ri_(VT, ISD::ADD, N, NIsKill, TotalOffs, VT); + if (N == 0) + // Unhandled operand. Halt "fast" selection and bail. + return false; + } // We successfully emitted code for the given LLVM Instruction. UpdateValueMap(I, N); @@ -760,12 +794,14 @@ FastISel::SelectInstruction(const Instruction *I) { // First, try doing target-independent selection. if (SelectOperator(I, I->getOpcode())) { + ++NumFastIselSuccessIndependent; DL = DebugLoc(); return true; } // Next, try calling the target to attempt to handle the instruction. 
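// ---- [editor's sketch, not part of the patch] ----------------------------
// What the TotalOffs change in SelectGetElementPtr above buys: fast-isel
// previously emitted one ADD per constant GEP index; the patch accumulates
// them and emits a single ADD (or a few, when the running total crosses the
// MaxOffs threshold). The core idea, with plain integers and hypothetical
// names:
#include <stdint.h>
#include <stddef.h>
static uint64_t coalesceConstOffsets(const uint64_t *Offs, size_t N,
                                     uint64_t Base) {
  uint64_t TotalOffs = 0;
  for (size_t i = 0; i != N; ++i)
    TotalOffs += Offs[i];   // fold all constant offsets together...
  return Base + TotalOffs;  // ...and add them to the base pointer once
}
// ---- [end editor's sketch] ------------------------------------------------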
if (TargetSelectInstruction(I)) { + ++NumFastIselSuccessTarget; DL = DebugLoc(); return true; } diff --git a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index 0bca55f..156cc70 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -293,6 +293,8 @@ SelectionDAGLegalize::ExpandConstantFP(ConstantFPSDNode *CFP, bool UseCP) { static void ExpandUnalignedStore(StoreSDNode *ST, SelectionDAG &DAG, const TargetLowering &TLI, SelectionDAGLegalize *DAGLegalize) { + assert(ST->getAddressingMode() == ISD::UNINDEXED && + "unaligned indexed stores not implemented!"); SDValue Chain = ST->getChain(); SDValue Ptr = ST->getBasePtr(); SDValue Val = ST->getValue(); @@ -413,6 +415,8 @@ static void ExpandUnalignedLoad(LoadSDNode *LD, SelectionDAG &DAG, const TargetLowering &TLI, SDValue &ValResult, SDValue &ChainResult) { + assert(LD->getAddressingMode() == ISD::UNINDEXED && + "unaligned indexed loads not implemented!"); SDValue Chain = LD->getChain(); SDValue Ptr = LD->getBasePtr(); EVT VT = LD->getValueType(0); diff --git a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp index f4164b2..fd24238 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeIntegerTypes.cpp @@ -20,7 +20,6 @@ #include "LegalizeTypes.h" #include "llvm/DerivedTypes.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" using namespace llvm; @@ -2927,38 +2926,28 @@ SDValue DAGTypeLegalizer::PromoteIntRes_SCALAR_TO_VECTOR(SDNode *N) { SDValue DAGTypeLegalizer::PromoteIntRes_CONCAT_VECTORS(SDNode *N) { DebugLoc dl = N->getDebugLoc(); - SDValue Op0 = N->getOperand(0); - SDValue Op1 = N->getOperand(1); - assert(Op0.getValueType() == Op1.getValueType() && - "Invalid input vector types"); - EVT OutVT = N->getValueType(0); EVT NOutVT = TLI.getTypeToTransformTo(*DAG.getContext(), OutVT); assert(NOutVT.isVector() && "This type must be promoted to a vector type"); + EVT InElemTy = OutVT.getVectorElementType(); EVT OutElemTy = NOutVT.getVectorElementType(); - unsigned NumElem0 = Op0.getValueType().getVectorNumElements(); - unsigned NumElem1 = Op1.getValueType().getVectorNumElements(); + unsigned NumElem = N->getOperand(0).getValueType().getVectorNumElements(); unsigned NumOutElem = NOutVT.getVectorNumElements(); - assert(NumElem0 + NumElem1 == NumOutElem && - "Invalid number of incoming elements"); + unsigned NumOperands = N->getNumOperands(); + assert(NumElem * NumOperands == NumOutElem && + "Unexpected number of elements"); // Take the elements from the first vector. 
SmallVector<SDValue, 8> Ops(NumOutElem); - for (unsigned i = 0; i < NumElem0; ++i) { - SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - Op0.getValueType().getScalarType(), Op0, - DAG.getIntPtrConstant(i)); - Ops[i] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext); - } - - // Take the elements from the second vector - for (unsigned i = 0; i < NumElem1; ++i) { - SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, - Op1.getValueType().getScalarType(), Op1, - DAG.getIntPtrConstant(i)); - Ops[i + NumElem0] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext); + for (unsigned i = 0; i < NumOperands; ++i) { + SDValue Op = N->getOperand(i); + for (unsigned j = 0; j < NumElem; ++j) { + SDValue Ext = DAG.getNode(ISD::EXTRACT_VECTOR_ELT, dl, + InElemTy, Op, DAG.getIntPtrConstant(j)); + Ops[i * NumElem + j] = DAG.getNode(ISD::ANY_EXTEND, dl, OutElemTy, Ext); + } } return DAG.getNode(ISD::BUILD_VECTOR, dl, NOutVT, &Ops[0], Ops.size()); diff --git a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp index 84d334a..7c5472b 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeTypesGeneric.cpp @@ -21,7 +21,6 @@ #include "LegalizeTypes.h" #include "llvm/Target/TargetData.h" -#include "llvm/CodeGen/PseudoSourceValue.h" using namespace llvm; //===----------------------------------------------------------------------===// diff --git a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index cb5df05..ad83565 100644 --- a/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -21,7 +21,6 @@ //===----------------------------------------------------------------------===// #include "LegalizeTypes.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Target/TargetData.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/raw_ostream.h" diff --git a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp index 7938a37..cd0da37 100644 --- a/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp +++ b/lib/CodeGen/SelectionDAG/ScheduleDAGRRList.cpp @@ -267,7 +267,7 @@ private: /// GetCostForDef - Looks up the register class and cost for a given definition. /// Typically this just means looking up the representative register class, -/// but for untyped values (MVT::untyped) it means inspecting the node's +/// but for untyped values (MVT::Untyped) it means inspecting the node's /// opcode to determine what register class is being generated. static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, const TargetLowering *TLI, @@ -278,7 +278,7 @@ static void GetCostForDef(const ScheduleDAGSDNodes::RegDefIter &RegDefPos, // Special handling for untyped values. These values can only come from // the expansion of custom DAG-to-DAG patterns. 
- if (VT == MVT::untyped) { + if (VT == MVT::Untyped) { const SDNode *Node = RegDefPos.GetNode(); unsigned Opcode = Node->getMachineOpcode(); @@ -948,6 +948,11 @@ SUnit *ScheduleDAGRRList::CopyAndMoveSuccessors(SUnit *SU) { if (!TII->unfoldMemoryOperand(*DAG, N, NewNodes)) return NULL; + // Unfolding an x86 DEC64m operation results in a store, a dec, and a load, + // which can't be handled here, so bail out. + if (NewNodes.size() == 3) + return NULL; + DEBUG(dbgs() << "Unfolding SU #" << SU->NodeNum << "\n"); assert(NewNodes.size() == 2 && "Expected a load folding node!"); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index 38dd7cc..497c286 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetData.h" #include "llvm/Target/TargetLowering.h" diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 18c29b8..8d02350 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -41,7 +41,6 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetData.h" @@ -1811,8 +1810,8 @@ void SelectionDAGBuilder::visitInvoke(const InvokeInst &I) { CopyToExportRegsIfNeeded(&I); // Update successor info - InvokeMBB->addSuccessor(Return); - InvokeMBB->addSuccessor(LandingPad); + addSuccessorWithWeight(InvokeMBB, Return); + addSuccessorWithWeight(InvokeMBB, LandingPad); // Drop into normal successor. DAG.setRoot(DAG.getNode(ISD::BR, getCurDebugLoc(), diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 5cbce3f..8cecc17 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -476,8 +476,8 @@ void SelectionDAGISel::CodeGenAndEmitDAG() { #endif { BlockNumber = FuncInfo->MBB->getNumber(); - BlockName = MF->getFunction()->getNameStr() + ":" + - FuncInfo->MBB->getBasicBlock()->getNameStr(); + BlockName = MF->getFunction()->getName().str() + ":" + + FuncInfo->MBB->getBasicBlock()->getName().str(); } DEBUG(dbgs() << "Initial selection DAG: BB#" << BlockNumber << " '" << BlockName << "'\n"; CurDAG->dump()); @@ -892,13 +892,16 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { FastIS->setLastLocalValue(0); } + unsigned NumFastIselRemaining = std::distance(Begin, End); // Do FastISel on as many instructions as possible. for (; BI != Begin; --BI) { const Instruction *Inst = llvm::prior(BI); // If we no longer require this instruction, skip it. - if (isFoldedOrDeadInstruction(Inst, FuncInfo)) + if (isFoldedOrDeadInstruction(Inst, FuncInfo)) { + --NumFastIselRemaining; continue; + } // Bottom-up: reset the insert pos at the top, after any local-value // instructions. @@ -906,6 +909,7 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { // Try to select the instruction with FastISel.
if (FastIS->SelectInstruction(Inst)) { + --NumFastIselRemaining; ++NumFastIselSuccess; // If fast isel succeeded, skip over all the folded instructions, and // then see if there is a load right before the selected instructions. @@ -918,15 +922,18 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { } if (BeforeInst != Inst && isa<LoadInst>(BeforeInst) && BeforeInst->hasOneUse() && - TryToFoldFastISelLoad(cast<LoadInst>(BeforeInst), Inst, FastIS)) + TryToFoldFastISelLoad(cast<LoadInst>(BeforeInst), Inst, FastIS)) { // If we succeeded, don't re-select the load. BI = llvm::next(BasicBlock::const_iterator(BeforeInst)); + --NumFastIselRemaining; + ++NumFastIselSuccess; + } continue; } // Then handle certain instructions as single-LLVM-Instruction blocks. if (isa<CallInst>(Inst)) { - ++NumFastIselFailures; + if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel missed call: "; Inst->dump(); @@ -941,24 +948,30 @@ void SelectionDAGISel::SelectAllBasicBlocks(const Function &Fn) { bool HadTailCall = false; SelectBasicBlock(Inst, BI, HadTailCall); + // Recompute NumFastIselRemaining as Selection DAG instruction + // selection may have handled the call, input args, etc. + unsigned RemainingNow = std::distance(Begin, BI); + NumFastIselFailures += NumFastIselRemaining - RemainingNow; + // If the call was emitted as a tail call, we're done with the block. if (HadTailCall) { --BI; break; } + NumFastIselRemaining = RemainingNow; continue; } if (isa<TerminatorInst>(Inst) && !isa<BranchInst>(Inst)) { // Don't abort, and use a different message for terminator misses. - ++NumFastIselFailures; + NumFastIselFailures += NumFastIselRemaining; if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel missed terminator: "; Inst->dump(); } } else { - ++NumFastIselFailures; + NumFastIselFailures += NumFastIselRemaining; if (EnableFastISelVerbose || EnableFastISelAbort) { dbgs() << "FastISel miss: "; Inst->dump(); diff --git a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp index cd1647b..a7cf089 100644 --- a/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp +++ b/lib/CodeGen/SelectionDAG/SelectionDAGPrinter.cpp @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineConstantPool.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineModuleInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Analysis/DebugInfo.h" #include "llvm/Target/TargetRegisterInfo.h" #include "llvm/Target/TargetMachine.h" @@ -148,7 +147,7 @@ std::string DOTGraphTraits<SelectionDAG*>::getNodeLabel(const SDNode *Node, void SelectionDAG::viewGraph(const std::string &Title) { // This code is only for debugging! #ifndef NDEBUG - ViewGraph(this, "dag." + getMachineFunction().getFunction()->getNameStr(), + ViewGraph(this, "dag." + getMachineFunction().getFunction()->getName(), false, Title); #else errs() << "SelectionDAG::viewGraph is only available in debug builds on " diff --git a/lib/CodeGen/ShrinkWrapping.cpp b/lib/CodeGen/ShrinkWrapping.cpp index 160f38f..13f269e 100644 --- a/lib/CodeGen/ShrinkWrapping.cpp +++ b/lib/CodeGen/ShrinkWrapping.cpp @@ -158,7 +158,7 @@ void PEI::initShrinkWrappingInfo() { // via --shrink-wrap-func=<funcname>. 
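// ---- [editor's note, not part of the patch] -------------------------------
// On the getNameStr() removals above and below: each rewrite replaces an
// std::string copy with a StringRef view of the name, and ".str()" is added
// only where a real std::string is required, e.g. for std::string
// concatenation. A minimal sketch, assuming the LLVM StringRef header:
#include <string>
#include "llvm/ADT/StringRef.h"
static std::string makeLabel(llvm::StringRef Name) {
  return "mf" + Name.str(); // explicit copy only at the point of use
}
// ---- [end editor's sketch] ------------------------------------------------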
#ifndef NDEBUG if (ShrinkWrapFunc != "") { - std::string MFName = MF->getFunction()->getNameStr(); + std::string MFName = MF->getFunction()->getName().str(); ShrinkWrapThisFunction = (MFName == ShrinkWrapFunc); } #endif @@ -1045,7 +1045,7 @@ std::string PEI::getBasicBlockName(const MachineBasicBlock* MBB) { return ""; if (MBB->getBasicBlock()) - return MBB->getBasicBlock()->getNameStr(); + return MBB->getBasicBlock()->getName().str(); std::ostringstream name; name << "_MBB_" << MBB->getNumber(); diff --git a/lib/CodeGen/SjLjEHPrepare.cpp b/lib/CodeGen/SjLjEHPrepare.cpp index ac88441..c865192 100644 --- a/lib/CodeGen/SjLjEHPrepare.cpp +++ b/lib/CodeGen/SjLjEHPrepare.cpp @@ -30,6 +30,7 @@ #include "llvm/Support/Debug.h" #include "llvm/Support/IRBuilder.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/SetVector.h" #include "llvm/ADT/SmallPtrSet.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/Statistic.h" @@ -48,10 +49,10 @@ namespace { Constant *BuiltinSetjmpFn; Constant *FrameAddrFn; Constant *StackAddrFn; + Constant *StackRestoreFn; Constant *LSDAAddrFn; Value *PersonalityFn; Constant *CallSiteFn; - Constant *DispatchSetupFn; Constant *FuncCtxFn; Value *CallSite; public: @@ -107,11 +108,10 @@ bool SjLjEHPass::doInitialization(Module &M) { (Type *)0); FrameAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::frameaddress); StackAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::stacksave); + StackRestoreFn = Intrinsic::getDeclaration(&M, Intrinsic::stackrestore); BuiltinSetjmpFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_setjmp); LSDAAddrFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_lsda); CallSiteFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_callsite); - DispatchSetupFn - = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_dispatch_setup); FuncCtxFn = Intrinsic::getDeclaration(&M, Intrinsic::eh_sjlj_functioncontext); PersonalityFn = 0; @@ -365,13 +365,13 @@ void SjLjEHPass::lowerAcrossUnwindEdges(Function &F, bool SjLjEHPass::setupEntryBlockAndCallSites(Function &F) { SmallVector<ReturnInst*, 16> Returns; SmallVector<InvokeInst*, 16> Invokes; - SmallVector<LandingPadInst*, 16> LPads; + SmallSetVector<LandingPadInst*, 16> LPads; // Look through the terminators of the basic blocks to find invokes. for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) if (InvokeInst *II = dyn_cast<InvokeInst>(BB->getTerminator())) { Invokes.push_back(II); - LPads.push_back(II->getUnwindDest()->getLandingPadInst()); + LPads.insert(II->getUnwindDest()->getLandingPadInst()); } else if (ReturnInst *RI = dyn_cast<ReturnInst>(BB->getTerminator())) { Returns.push_back(RI); } @@ -383,7 +383,8 @@ bool SjLjEHPass::setupEntryBlockAndCallSites(Function &F) { lowerIncomingArguments(F); lowerAcrossUnwindEdges(F, Invokes); - Value *FuncCtx = setupFunctionContext(F, LPads); + Value *FuncCtx = + setupFunctionContext(F, makeArrayRef(LPads.begin(), LPads.end())); BasicBlock *EntryBB = F.begin(); Type *Int32Ty = Type::getInt32Ty(F.getContext()); @@ -460,6 +461,25 @@ bool SjLjEHPass::setupEntryBlockAndCallSites(Function &F) { EntryBB->getTerminator()); Register->setDoesNotThrow(); + // Following any allocas not in the entry block, update the saved SP in the + // jmpbuf to the new value. 
+ for (Function::iterator BB = F.begin(), E = F.end(); BB != E; ++BB) { + if (BB == F.begin()) + continue; + for (BasicBlock::iterator I = BB->begin(), E = BB->end(); I != E; ++I) { + if (CallInst *CI = dyn_cast<CallInst>(I)) { + if (CI->getCalledFunction() != StackRestoreFn) + continue; + } else if (!isa<AllocaInst>(I)) { + continue; + } + Instruction *StackAddr = CallInst::Create(StackAddrFn, "sp"); + StackAddr->insertAfter(I); + Instruction *StoreStackAddr = new StoreInst(StackAddr, StackPtr, true); + StoreStackAddr->insertAfter(StackAddr); + } + } + // Finally, for any returns from this function, if this function contains an // invoke, add a call to unregister the function context. for (unsigned I = 0, E = Returns.size(); I != E; ++I) diff --git a/lib/CodeGen/StackProtector.cpp b/lib/CodeGen/StackProtector.cpp index 1f0e5a2..43a6ad8 100644 --- a/lib/CodeGen/StackProtector.cpp +++ b/lib/CodeGen/StackProtector.cpp @@ -123,16 +123,11 @@ bool StackProtector::RequiresStackProtector() const { // protectors. return true; - if (ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) { - // We apparently only care about character arrays. - if (!AT->getElementType()->isIntegerTy(8)) - continue; - + if (ArrayType *AT = dyn_cast<ArrayType>(AI->getAllocatedType())) // If an array has more than SSPBufferSize bytes of allocated space, // then we emit stack protectors. if (SSPBufferSize <= TD->getTypeAllocSize(AT)) return true; - } } } diff --git a/lib/CodeGen/TargetInstrInfoImpl.cpp b/lib/CodeGen/TargetInstrInfoImpl.cpp index f32678f..08e2b16 100644 --- a/lib/CodeGen/TargetInstrInfoImpl.cpp +++ b/lib/CodeGen/TargetInstrInfoImpl.cpp @@ -296,8 +296,7 @@ TargetInstrInfo::foldMemoryOperand(MachineBasicBlock::iterator MI, const MachineFrameInfo &MFI = *MF.getFrameInfo(); assert(MFI.getObjectOffset(FI) != -1); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), Flags, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); NewMI->addMemOperand(MF, MMO); diff --git a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp index 3848f4d..c43e5b6 100644 --- a/lib/CodeGen/TargetLoweringObjectFileImpl.cpp +++ b/lib/CodeGen/TargetLoweringObjectFileImpl.cpp @@ -358,9 +358,9 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, TAA, TAAParsed, StubSize); if (!ErrorCode.empty()) { // If invalid, report the error with report_fatal_error. - report_fatal_error("Global variable '" + GV->getNameStr() + - "' has an invalid section specifier '" + GV->getSection()+ - "': " + ErrorCode + "."); + report_fatal_error("Global variable '" + GV->getName() + + "' has an invalid section specifier '" + + GV->getSection() + "': " + ErrorCode + "."); // Fall back to dropping it into the data section. return DataSection; } @@ -379,9 +379,9 @@ getExplicitSectionGlobal(const GlobalValue *GV, SectionKind Kind, // to reject it here. if (S->getTypeAndAttributes() != TAA || S->getStubSize() != StubSize) { // If invalid, report the error with report_fatal_error. 
- report_fatal_error("Global variable '" + GV->getNameStr() + - "' section type or attributes does not match previous" - " section specifier"); + report_fatal_error("Global variable '" + GV->getName() + + "' section type or attributes does not match previous" + " section specifier"); } return S; diff --git a/lib/CodeGen/TwoAddressInstructionPass.cpp b/lib/CodeGen/TwoAddressInstructionPass.cpp index 6796312..a2e8134 100644 --- a/lib/CodeGen/TwoAddressInstructionPass.cpp +++ b/lib/CodeGen/TwoAddressInstructionPass.cpp @@ -68,6 +68,7 @@ namespace { MachineRegisterInfo *MRI; LiveVariables *LV; AliasAnalysis *AA; + CodeGenOpt::Level OptLevel; // DistanceMap - Keep track the distance of a MI from the start of the // current basic block. @@ -571,6 +572,9 @@ bool TwoAddressInstructionPass::isProfitableToCommute(unsigned regB, unsigned regC, MachineInstr *MI, MachineBasicBlock *MBB, unsigned Dist) { + if (OptLevel == CodeGenOpt::None) + return false; + // Determine if it's profitable to commute this two address instruction. In // general, we want no uses between this instruction and the definition of // the two-address register. @@ -924,7 +928,7 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, if (isTwoAddrUse(*KillMI, Reg, DstReg)) return false; - bool SeenStore; + bool SeenStore = true; if (!MI->isSafeToMove(TII, AA, SeenStore)) return false; @@ -933,6 +937,7 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, return false; SmallSet<unsigned, 2> Uses; + SmallSet<unsigned, 2> Kills; SmallSet<unsigned, 2> Defs; for (unsigned i = 0, e = MI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = MI->getOperand(i); @@ -943,8 +948,11 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, continue; if (MO.isDef()) Defs.insert(MOReg); - else + else { Uses.insert(MOReg); + if (MO.isKill() && MOReg != Reg) + Kills.insert(MOReg); + } } // Move the copies connected to MI down as well. @@ -991,7 +999,8 @@ TwoAddressInstructionPass::RescheduleMIBelowKill(MachineBasicBlock *MBB, } else { if (Defs.count(MOReg)) return false; - if (MOReg != Reg && MO.isKill() && Uses.count(MOReg)) + if (MOReg != Reg && + ((MO.isKill() && Uses.count(MOReg)) || Kills.count(MOReg))) // Don't want to extend other live ranges and update kills. return false; } @@ -1071,7 +1080,7 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, if (isTwoAddrUse(*KillMI, Reg, DstReg)) return false; - bool SeenStore; + bool SeenStore = true; if (!KillMI->isSafeToMove(TII, AA, SeenStore)) return false; @@ -1115,6 +1124,7 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, MCID.isTerminator()) // Don't move pass calls, etc. return false; + SmallVector<unsigned, 2> OtherDefs; for (unsigned i = 0, e = OtherMI->getNumOperands(); i != e; ++i) { const MachineOperand &MO = OtherMI->getOperand(i); if (!MO.isReg()) @@ -1131,15 +1141,20 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, // Don't want to extend other live ranges and update kills. return false; } else { - if (Uses.count(MOReg)) - return false; - if (TargetRegisterInfo::isPhysicalRegister(MOReg) && - LiveDefs.count(MOReg)) - return false; - // Physical register def is seen. 
- Defs.erase(MOReg); + OtherDefs.push_back(MOReg); } } + + for (unsigned i = 0, e = OtherDefs.size(); i != e; ++i) { + unsigned MOReg = OtherDefs[i]; + if (Uses.count(MOReg)) + return false; + if (TargetRegisterInfo::isPhysicalRegister(MOReg) && + LiveDefs.count(MOReg)) + return false; + // Physical register def is seen. + Defs.erase(MOReg); + } } // Move the old kill above MI, don't forget to move debug info as well. @@ -1152,7 +1167,7 @@ TwoAddressInstructionPass::RescheduleKillAboveMI(MachineBasicBlock *MBB, --From; MBB->splice(InsertPos, MBB, From, To); - nmi = llvm::prior(mi); // Backtrack so we process the moved instruction. + nmi = llvm::prior(InsertPos); // Backtrack so we process the moved instr. DistanceMap.erase(DI); if (LV) { @@ -1182,6 +1197,9 @@ TryInstructionTransform(MachineBasicBlock::iterator &mi, MachineFunction::iterator &mbbi, unsigned SrcIdx, unsigned DstIdx, unsigned Dist, SmallPtrSet<MachineInstr*, 8> &Processed) { + if (OptLevel == CodeGenOpt::None) + return false; + MachineInstr &MI = *mi; const MCInstrDesc &MCID = MI.getDesc(); unsigned regA = MI.getOperand(DstIdx).getReg(); @@ -1377,6 +1395,7 @@ bool TwoAddressInstructionPass::runOnMachineFunction(MachineFunction &MF) { InstrItins = TM.getInstrItineraryData(); LV = getAnalysisIfAvailable<LiveVariables>(); AA = &getAnalysis<AliasAnalysis>(); + OptLevel = TM.getOptLevel(); bool MadeChange = false; diff --git a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp index 055875c..1bcdbe2 100644 --- a/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp +++ b/lib/ExecutionEngine/Interpreter/ExternalFunctions.cpp @@ -94,15 +94,16 @@ static ExFunc lookupFunction(const Function *F) { FunctionType *FT = F->getFunctionType(); for (unsigned i = 0, e = FT->getNumContainedTypes(); i != e; ++i) ExtName += getTypeID(FT->getContainedType(i)); - ExtName + "_" + F->getNameStr(); + ExtName += "_" + F->getName().str(); sys::ScopedLock Writer(*FunctionsLock); ExFunc FnPtr = FuncNames[ExtName]; if (FnPtr == 0) - FnPtr = FuncNames["lle_X_" + F->getNameStr()]; + FnPtr = FuncNames["lle_X_" + F->getName().str()]; if (FnPtr == 0) // Try calling a generic function... if it exists... FnPtr = (ExFunc)(intptr_t) - sys::DynamicLibrary::SearchForAddressOfSymbol("lle_X_"+F->getNameStr()); + sys::DynamicLibrary::SearchForAddressOfSymbol("lle_X_" + + F->getName().str()); if (FnPtr != 0) ExportedFunctions->insert(std::make_pair(F, FnPtr)); // Cache for later return FnPtr; diff --git a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp index e71c20b..2e90968 100644 --- a/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp +++ b/lib/ExecutionEngine/JIT/JITDebugRegisterer.cpp @@ -118,7 +118,7 @@ std::string JITDebugRegisterer::MakeELF(const Function *F, DebugInfo &I) { if (JITEmitDebugInfoToDisk) { std::string Filename; raw_string_ostream O2(Filename); - O2 << "/tmp/llvm_function_" << I.FnStart << "_" << F->getNameStr() << ".o"; + O2 << "/tmp/llvm_function_" << I.FnStart << "_" << F->getName() << ".o"; O2.flush(); std::string Errors; raw_fd_ostream O3(Filename.c_str(), Errors); diff --git a/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 7c8a740..d5f407d 100644 --- a/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -64,7 +64,7 @@ MCJIT::MCJIT(Module *m, TargetMachine *tm, TargetJITInfo &tji, // Turn the machine code intermediate representation into bytes in memory // that may be executed. 
- if (TM->addPassesToEmitMC(PM, Ctx, OS, CodeGenOpt::Default, false)) { + if (TM->addPassesToEmitMC(PM, Ctx, OS, false)) { report_fatal_error("Target does not support MC emission!"); } diff --git a/lib/MC/ELFObjectWriter.cpp b/lib/MC/ELFObjectWriter.cpp index eee002a..bd28069 100644 --- a/lib/MC/ELFObjectWriter.cpp +++ b/lib/MC/ELFObjectWriter.cpp @@ -1579,8 +1579,8 @@ unsigned PPCELFObjectWriter::GetRelocType(const MCValue &Target, return Type; } -void -PPCELFObjectWriter::adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { +void PPCELFObjectWriter:: +adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { switch ((unsigned)Fixup.getKind()) { case PPC::fixup_ppc_ha16: case PPC::fixup_ppc_lo16: @@ -1825,6 +1825,12 @@ MipsELFObjectWriter::MipsELFObjectWriter(MCELFObjectTargetWriter *MOTW, MipsELFObjectWriter::~MipsELFObjectWriter() {} +// FIXME: get the real EABI Version from the Triple. +void MipsELFObjectWriter::WriteEFlags() { + Write32(ELF::EF_MIPS_NOREORDER | + ELF::EF_MIPS_ARCH_32R2); +} + unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, @@ -1840,6 +1846,9 @@ unsigned MipsELFObjectWriter::GetRelocType(const MCValue &Target, case FK_Data_4: Type = ELF::R_MIPS_32; break; + case FK_GPRel_4: + Type = ELF::R_MIPS_GPREL32; + break; case Mips::fixup_Mips_GPREL16: Type = ELF::R_MIPS_GPREL16; break; diff --git a/lib/MC/ELFObjectWriter.h b/lib/MC/ELFObjectWriter.h index 862b085..7838206 100644 --- a/lib/MC/ELFObjectWriter.h +++ b/lib/MC/ELFObjectWriter.h @@ -240,33 +240,38 @@ class ELFObjectWriter : public MCObjectWriter { F.getContents() += StringRef(buf, 8); } - virtual void WriteHeader(uint64_t SectionDataSize, unsigned NumberOfSections); + virtual void WriteHeader(uint64_t SectionDataSize, + unsigned NumberOfSections); /// Default e_flags = 0 virtual void WriteEFlags() { Write32(0); } - virtual void WriteSymbolEntry(MCDataFragment *SymtabF, MCDataFragment *ShndxF, - uint64_t name, uint8_t info, - uint64_t value, uint64_t size, - uint8_t other, uint32_t shndx, - bool Reserved); + virtual void WriteSymbolEntry(MCDataFragment *SymtabF, + MCDataFragment *ShndxF, + uint64_t name, uint8_t info, + uint64_t value, uint64_t size, + uint8_t other, uint32_t shndx, + bool Reserved); virtual void WriteSymbol(MCDataFragment *SymtabF, MCDataFragment *ShndxF, ELFSymbolData &MSD, const MCAsmLayout &Layout); typedef DenseMap<const MCSectionELF*, uint32_t> SectionIndexMapTy; - virtual void WriteSymbolTable(MCDataFragment *SymtabF, MCDataFragment *ShndxF, - const MCAssembler &Asm, - const MCAsmLayout &Layout, - const SectionIndexMapTy &SectionIndexMap); - - virtual void RecordRelocation(const MCAssembler &Asm, const MCAsmLayout &Layout, - const MCFragment *Fragment, const MCFixup &Fixup, + virtual void WriteSymbolTable(MCDataFragment *SymtabF, + MCDataFragment *ShndxF, + const MCAssembler &Asm, + const MCAsmLayout &Layout, + const SectionIndexMapTy &SectionIndexMap); + + virtual void RecordRelocation(const MCAssembler &Asm, + const MCAsmLayout &Layout, + const MCFragment *Fragment, + const MCFixup &Fixup, MCValue Target, uint64_t &FixedValue); virtual uint64_t getSymbolIndexInSymbolTable(const MCAssembler &Asm, - const MCSymbol *S); + const MCSymbol *S); // Map from a group section to the signature symbol typedef DenseMap<const MCSectionELF*, const MCSymbol*> GroupMapTy; @@ -347,7 +352,8 @@ class ELFObjectWriter : public MCObjectWriter { virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool 
IsRelocWithSymbol, int64_t Addend) = 0; - virtual void adjustFixupOffset(const MCFixup &Fixup, uint64_t &RelocOffset) { } + virtual void adjustFixupOffset(const MCFixup &Fixup, + uint64_t &RelocOffset) {} }; //===- X86ELFObjectWriter -------------------------------------------===// @@ -436,6 +442,8 @@ class ELFObjectWriter : public MCObjectWriter { bool IsLittleEndian); virtual ~MipsELFObjectWriter(); + virtual void WriteEFlags(); + protected: virtual unsigned GetRelocType(const MCValue &Target, const MCFixup &Fixup, bool IsPCRel, bool IsRelocWithSymbol, diff --git a/lib/MC/MCAsmBackend.cpp b/lib/MC/MCAsmBackend.cpp index 2c150f4..936ed55 100644 --- a/lib/MC/MCAsmBackend.cpp +++ b/lib/MC/MCAsmBackend.cpp @@ -21,14 +21,18 @@ MCAsmBackend::~MCAsmBackend() { const MCFixupKindInfo & MCAsmBackend::getFixupKindInfo(MCFixupKind Kind) const { static const MCFixupKindInfo Builtins[] = { - { "FK_Data_1", 0, 8, 0 }, - { "FK_Data_2", 0, 16, 0 }, - { "FK_Data_4", 0, 32, 0 }, - { "FK_Data_8", 0, 64, 0 }, - { "FK_PCRel_1", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, + { "FK_Data_1", 0, 8, 0 }, + { "FK_Data_2", 0, 16, 0 }, + { "FK_Data_4", 0, 32, 0 }, + { "FK_Data_8", 0, 64, 0 }, + { "FK_PCRel_1", 0, 8, MCFixupKindInfo::FKF_IsPCRel }, { "FK_PCRel_2", 0, 16, MCFixupKindInfo::FKF_IsPCRel }, { "FK_PCRel_4", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, - { "FK_PCRel_8", 0, 64, MCFixupKindInfo::FKF_IsPCRel } + { "FK_PCRel_8", 0, 64, MCFixupKindInfo::FKF_IsPCRel }, + { "FK_GPRel_1", 0, 8, 0 }, + { "FK_GPRel_2", 0, 16, 0 }, + { "FK_GPRel_4", 0, 32, 0 }, + { "FK_GPRel_8", 0, 64, 0 } }; assert((size_t)Kind <= sizeof(Builtins) / sizeof(Builtins[0]) && diff --git a/lib/MC/MCCodeGenInfo.cpp b/lib/MC/MCCodeGenInfo.cpp index 236e7de..d9dcfd0 100644 --- a/lib/MC/MCCodeGenInfo.cpp +++ b/lib/MC/MCCodeGenInfo.cpp @@ -15,7 +15,9 @@ #include "llvm/MC/MCCodeGenInfo.h" using namespace llvm; -void MCCodeGenInfo::InitMCCodeGenInfo(Reloc::Model RM, CodeModel::Model CM) { +void MCCodeGenInfo::InitMCCodeGenInfo(Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) { RelocationModel = RM; CMModel = CM; + OptLevel = OL; } diff --git a/lib/MC/MCELFStreamer.cpp b/lib/MC/MCELFStreamer.cpp index 0b366da..0ea3c64 100644 --- a/lib/MC/MCELFStreamer.cpp +++ b/lib/MC/MCELFStreamer.cpp @@ -205,10 +205,10 @@ void MCELFStreamer::EmitCommonSymbol(MCSymbol *Symbol, uint64_t Size, if (MCELF::GetBinding(SD) == ELF_STB_Local) { const MCSection *Section = getAssembler().getContext().getELFSection(".bss", - ELF::SHT_NOBITS, - ELF::SHF_WRITE | - ELF::SHF_ALLOC, - SectionKind::getBSS()); + ELF::SHT_NOBITS, + ELF::SHF_WRITE | + ELF::SHF_ALLOC, + SectionKind::getBSS()); Symbol->setSection(*Section); struct LocalCommon L = {&SD, Size, ByteAlignment}; diff --git a/lib/MC/MCExpr.cpp b/lib/MC/MCExpr.cpp index 4af27ab..da297fb 100644 --- a/lib/MC/MCExpr.cpp +++ b/lib/MC/MCExpr.cpp @@ -26,38 +26,6 @@ STATISTIC(MCExprEvaluate, "Number of MCExpr evaluations"); } } -static bool printMipsSymbolRef(const MCSymbolRefExpr &SRE, - const MCSymbol &Sym, raw_ostream &OS) { - MCSymbolRefExpr::VariantKind Kind= SRE.getKind(); - - switch (Kind) { - default: - return false; - case MCSymbolRefExpr::VK_Mips_None: break; - case MCSymbolRefExpr::VK_Mips_GPREL: OS << "%gp_rel("; break; - case MCSymbolRefExpr::VK_Mips_GOT_CALL: OS << "%call16("; break; - case MCSymbolRefExpr::VK_Mips_GOT: OS << "%got("; break; - case MCSymbolRefExpr::VK_Mips_ABS_HI: OS << "%hi("; break; - case MCSymbolRefExpr::VK_Mips_ABS_LO: OS << "%lo("; break; - case MCSymbolRefExpr::VK_Mips_TLSGD: OS << "%tlsgd("; 
break; - case MCSymbolRefExpr::VK_Mips_GOTTPREL: OS << "%gottprel("; break; - case MCSymbolRefExpr::VK_Mips_TPREL_HI: OS << "%tprel_hi("; break; - case MCSymbolRefExpr::VK_Mips_TPREL_LO: OS << "%tprel_lo("; break; - case MCSymbolRefExpr::VK_Mips_GPOFF_HI: OS << "%hi(%neg(%gp_rel("; break; - case MCSymbolRefExpr::VK_Mips_GPOFF_LO: OS << "%lo(%neg(%gp_rel("; break; - case MCSymbolRefExpr::VK_Mips_GOT_DISP: OS << "%got_disp("; break; - case MCSymbolRefExpr::VK_Mips_GOT_PAGE: OS << "%got_page("; break; - case MCSymbolRefExpr::VK_Mips_GOT_OFST: OS << "%got_ofst("; break; - } - - OS << Sym; - - if (Kind != MCSymbolRefExpr::VK_Mips_None) - OS << ')'; - - return true; -} - void MCExpr::print(raw_ostream &OS) const { switch (getKind()) { case MCExpr::Target: @@ -73,9 +41,6 @@ void MCExpr::print(raw_ostream &OS) const { // absolute names. bool UseParens = Sym.getName()[0] == '$'; - if (printMipsSymbolRef(SRE, Sym, OS)) - return; - if (SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_HA16 || SRE.getKind() == MCSymbolRefExpr::VK_PPC_DARWIN_LO16) { OS << MCSymbolRefExpr::getVariantKindName(SRE.getKind()); diff --git a/lib/MC/MCModule.cpp b/lib/MC/MCModule.cpp index b1d09d9..f563160 100644 --- a/lib/MC/MCModule.cpp +++ b/lib/MC/MCModule.cpp @@ -1,4 +1,4 @@ -//===- lib/MC/MCModule.cpp - MCModule implementation --------------------------===// +//===- lib/MC/MCModule.cpp - MCModule implementation ----------------------===// // // The LLVM Compiler Infrastructure // diff --git a/lib/MC/MCObjectFileInfo.cpp b/lib/MC/MCObjectFileInfo.cpp index d76e48b..7d23541 100644 --- a/lib/MC/MCObjectFileInfo.cpp +++ b/lib/MC/MCObjectFileInfo.cpp @@ -56,8 +56,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { TLSThreadInitSection = Ctx->getMachOSection("__DATA", "__thread_init", - MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS, - SectionKind::getDataRel()); + MCSectionMachO::S_THREAD_LOCAL_INIT_FUNCTION_POINTERS, + SectionKind::getDataRel()); CStringSection // .cstring = Ctx->getMachOSection("__TEXT", "__cstring", @@ -221,8 +221,8 @@ void MCObjectFileInfo::InitMachOMCObjectFileInfo(Triple T) { void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { if (T.getArch() == Triple::x86) { PersonalityEncoding = (RelocM == Reloc::PIC_) - ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 - : dwarf::DW_EH_PE_absptr; + ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 + : dwarf::DW_EH_PE_absptr; LSDAEncoding = (RelocM == Reloc::PIC_) ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_absptr; @@ -230,8 +230,8 @@ void MCObjectFileInfo::InitELFMCObjectFileInfo(Triple T) { ? dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 : dwarf::DW_EH_PE_absptr; TTypeEncoding = (RelocM == Reloc::PIC_) - ? dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 - : dwarf::DW_EH_PE_absptr; + ? 
dwarf::DW_EH_PE_indirect | dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4 + : dwarf::DW_EH_PE_absptr; } else if (T.getArch() == Triple::x86_64) { FDECFIEncoding = dwarf::DW_EH_PE_pcrel | dwarf::DW_EH_PE_sdata4; diff --git a/lib/MC/MCObjectStreamer.cpp b/lib/MC/MCObjectStreamer.cpp index a04ae08..90c957f 100644 --- a/lib/MC/MCObjectStreamer.cpp +++ b/lib/MC/MCObjectStreamer.cpp @@ -245,6 +245,16 @@ void MCObjectStreamer::EmitValueToOffset(const MCExpr *Offset, EmitFill(Res, Value, 0); } +// Associate GPRel32 fixup with data and resize data area +void MCObjectStreamer::EmitGPRel32Value(const MCExpr *Value) { + MCDataFragment *DF = getOrCreateDataFragment(); + + DF->addFixup(MCFixup::Create(DF->getContents().size(), + Value, + FK_GPRel_4)); + DF->getContents().resize(DF->getContents().size() + 4, 0); +} + void MCObjectStreamer::Finish() { // Dump out the dwarf file & directory tables and line tables. if (getContext().hasDwarfFiles()) diff --git a/lib/Object/COFFObjectFile.cpp b/lib/Object/COFFObjectFile.cpp index 2abfb44..c6ce562 100644 --- a/lib/Object/COFFObjectFile.cpp +++ b/lib/Object/COFFObjectFile.cpp @@ -226,7 +226,9 @@ error_code COFFObjectFile::getSymbolNMTypeChar(DataRefImpl Symb, if (symb->StorageClass == COFF::IMAGE_SYM_CLASS_WEAK_EXTERNAL) { Result = 'w'; return object_error::success; // Don't do ::toupper. - } else + } else if (symb->Value != 0) // Check for common symbols. + ret = 'c'; + else ret = 'u'; break; case COFF::IMAGE_SYM_ABSOLUTE: diff --git a/lib/Support/APInt.cpp b/lib/Support/APInt.cpp index 3774c52..55cb433 100644 --- a/lib/Support/APInt.cpp +++ b/lib/Support/APInt.cpp @@ -870,30 +870,43 @@ unsigned APInt::countPopulationSlowCase() const { return Count; } +/// Perform a logical right-shift from Src to Dst, which must be equal or +/// non-overlapping, of Words words, by Shift, which must be less than 64. 
+static void lshrNear(uint64_t *Dst, uint64_t *Src, unsigned Words, + unsigned Shift) { + uint64_t Carry = 0; + for (int I = Words - 1; I >= 0; --I) { + uint64_t Tmp = Src[I]; + Dst[I] = (Tmp >> Shift) | Carry; + Carry = Tmp << (64 - Shift); + } +} + APInt APInt::byteSwap() const { assert(BitWidth >= 16 && BitWidth % 16 == 0 && "Cannot byteswap!"); if (BitWidth == 16) return APInt(BitWidth, ByteSwap_16(uint16_t(VAL))); - else if (BitWidth == 32) + if (BitWidth == 32) return APInt(BitWidth, ByteSwap_32(unsigned(VAL))); - else if (BitWidth == 48) { + if (BitWidth == 48) { unsigned Tmp1 = unsigned(VAL >> 16); Tmp1 = ByteSwap_32(Tmp1); uint16_t Tmp2 = uint16_t(VAL); Tmp2 = ByteSwap_16(Tmp2); return APInt(BitWidth, (uint64_t(Tmp2) << 32) | Tmp1); - } else if (BitWidth == 64) + } + if (BitWidth == 64) return APInt(BitWidth, ByteSwap_64(VAL)); - else { - APInt Result(BitWidth, 0); - char *pByte = (char*)Result.pVal; - for (unsigned i = 0; i < BitWidth / APINT_WORD_SIZE / 2; ++i) { - char Tmp = pByte[i]; - pByte[i] = pByte[BitWidth / APINT_WORD_SIZE - 1 - i]; - pByte[BitWidth / APINT_WORD_SIZE - i - 1] = Tmp; - } - return Result; + + APInt Result(getNumWords() * APINT_BITS_PER_WORD, 0); + for (unsigned I = 0, N = getNumWords(); I != N; ++I) + Result.pVal[I] = ByteSwap_64(pVal[N - I - 1]); + if (Result.BitWidth != BitWidth) { + lshrNear(Result.pVal, Result.pVal, getNumWords(), + Result.BitWidth - BitWidth); + Result.BitWidth = BitWidth; } + return Result; } APInt llvm::APIntOps::GreatestCommonDivisor(const APInt& API1, @@ -1232,11 +1245,7 @@ APInt APInt::lshr(unsigned shiftAmt) const { // If we are shifting less than a word, compute the shift with a simple carry if (shiftAmt < APINT_BITS_PER_WORD) { - uint64_t carry = 0; - for (int i = getNumWords()-1; i >= 0; --i) { - val[i] = (pVal[i] >> shiftAmt) | carry; - carry = pVal[i] << (APINT_BITS_PER_WORD - shiftAmt); - } + lshrNear(val, pVal, getNumWords(), shiftAmt); return APInt(val, BitWidth).clearUnusedBits(); } diff --git a/lib/Support/ConstantRange.cpp b/lib/Support/ConstantRange.cpp index c29cb53..5743479 100644 --- a/lib/Support/ConstantRange.cpp +++ b/lib/Support/ConstantRange.cpp @@ -466,10 +466,8 @@ ConstantRange ConstantRange::signExtend(uint32_t DstTySize) const { /// correspond to the possible range of values as if the source range had been /// truncated to the specified type. 
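The rewritten truncate body that follows trades the old getSetSize().ugt(Size) comparison for an APInt::getActiveBits() test; the two agree, since a set size needing more than DstTySize bits means the range holds at least 2^DstTySize elements and cannot truncate exactly. A worked example (illustrative): truncating [0, 300) from i32 to i8 gives getSetSize() == 300, which has 9 active bits, more than 8, and indeed those 300 values cover every i8 value after truncation, so returning the full set is correct.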
ConstantRange ConstantRange::truncate(uint32_t DstTySize) const { - unsigned SrcTySize = getBitWidth(); - assert(SrcTySize > DstTySize && "Not a value truncation"); - APInt Size(APInt::getLowBitsSet(SrcTySize, DstTySize)); - if (isFullSet() || getSetSize().ugt(Size)) + assert(getBitWidth() > DstTySize && "Not a value truncation"); + if (isFullSet() || getSetSize().getActiveBits() > DstTySize) return ConstantRange(DstTySize, /*isFullSet=*/true); return ConstantRange(Lower.trunc(DstTySize), Upper.trunc(DstTySize)); diff --git a/lib/Support/MemoryBuffer.cpp b/lib/Support/MemoryBuffer.cpp index cc3f6a8..1a40972 100644 --- a/lib/Support/MemoryBuffer.cpp +++ b/lib/Support/MemoryBuffer.cpp @@ -14,6 +14,7 @@ #include "llvm/Support/MemoryBuffer.h" #include "llvm/ADT/OwningPtr.h" #include "llvm/ADT/SmallString.h" +#include "llvm/Config/config.h" #include "llvm/Support/MathExtras.h" #include "llvm/Support/Errno.h" #include "llvm/Support/Path.h" @@ -320,23 +321,24 @@ error_code MemoryBuffer::getOpenFile(int FD, const char *Filename, char *BufPtr = const_cast<char*>(SB->getBufferStart()); size_t BytesLeft = MapSize; +#ifndef HAVE_PREAD if (lseek(FD, Offset, SEEK_SET) == -1) return error_code(errno, posix_category()); +#endif while (BytesLeft) { +#ifdef HAVE_PREAD + ssize_t NumRead = ::pread(FD, BufPtr, BytesLeft, MapSize-BytesLeft+Offset); +#else ssize_t NumRead = ::read(FD, BufPtr, BytesLeft); +#endif if (NumRead == -1) { if (errno == EINTR) continue; // Error while reading. return error_code(errno, posix_category()); - } else if (NumRead == 0) { - // We hit EOF early, truncate and terminate buffer. - Buf->BufferEnd = BufPtr; - *BufPtr = 0; - result.swap(SB); - return success; } + assert(NumRead != 0 && "fstat reported an invalid file size."); BytesLeft -= NumRead; BufPtr += NumRead; } diff --git a/lib/Target/ARM/ARMBaseInstrInfo.cpp b/lib/Target/ARM/ARMBaseInstrInfo.cpp index 7a7267a..9315348 100644 --- a/lib/Target/ARM/ARMBaseInstrInfo.cpp +++ b/lib/Target/ARM/ARMBaseInstrInfo.cpp @@ -28,7 +28,6 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAGNodes.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/Support/BranchProbability.h" @@ -47,7 +46,7 @@ EnableARM3Addr("enable-arm-3-addr-conv", cl::Hidden, cl::desc("Enable ARM 2-addr to 3-addr conv")); static cl::opt<bool> -WidenVMOVS("widen-vmovs", cl::Hidden, +WidenVMOVS("widen-vmovs", cl::Hidden, cl::init(true), cl::desc("Widen ARM vmovs to vmovd when possible")); /// ARM_MLxEntry - Record information about MLA / MLS instructions. 
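A note on the MemoryBuffer::getOpenFile change above: when HAVE_PREAD is defined, the read offset travels with every call instead of being set once via lseek, so concurrent readers of one descriptor no longer race on the shared file position. Schematically (a sketch of the two call shapes, not new API):

    // read(): consumes the descriptor's shared file position
    lseek(FD, Offset, SEEK_SET);
    read(FD, BufPtr, BytesLeft);

    // pread(): the offset is an explicit argument; no seek, no shared state
    pread(FD, BufPtr, BytesLeft, MapSize - BytesLeft + Offset);

The early-EOF branch likewise becomes an assert, since the buffer size was taken from fstat and reading zero bytes early would mean that size was wrong.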
@@ -710,8 +709,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Align = MFI.getObjectAlignment(FI); MachineMemOperand *MMO = - MF.getMachineMemOperand(MachinePointerInfo( - PseudoSourceValue::getFixedStack(FI)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), Align); @@ -862,7 +860,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, unsigned Align = MFI.getObjectAlignment(FI); MachineMemOperand *MMO = MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), Align); diff --git a/lib/Target/ARM/ARMExpandPseudoInsts.cpp b/lib/Target/ARM/ARMExpandPseudoInsts.cpp index fb7d96a..fc464ea 100644 --- a/lib/Target/ARM/ARMExpandPseudoInsts.cpp +++ b/lib/Target/ARM/ARMExpandPseudoInsts.cpp @@ -824,7 +824,7 @@ bool ARMExpandPseudo::ExpandMI(MachineBasicBlock &MBB, MI.eraseFromParent(); return true; } - case ARM::Int_eh_sjlj_dispatchsetup: { + case ARM::eh_sjlj_dispatchsetup: { MachineFunction &MF = *MI.getParent()->getParent(); const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); diff --git a/lib/Target/ARM/ARMFastISel.cpp b/lib/Target/ARM/ARMFastISel.cpp index 4df084f..9bae422 100644 --- a/lib/Target/ARM/ARMFastISel.cpp +++ b/lib/Target/ARM/ARMFastISel.cpp @@ -37,7 +37,6 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/CallSite.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -197,8 +196,6 @@ class ARMFastISel : public FastISel { // Call handling routines. private: - bool FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, EVT SrcVT, - unsigned &ResultReg); CCAssignFn *CCAssignFnForCall(CallingConv::ID CC, bool Return); bool ProcessCallArgs(SmallVectorImpl<Value*> &Args, SmallVectorImpl<unsigned> &ArgRegs, @@ -687,6 +684,8 @@ unsigned ARMFastISel::TargetMaterializeConstant(const Constant *C) { return 0; } +// TODO: unsigned ARMFastISel::TargetMaterializeFloatZero(const ConstantFP *CF); + unsigned ARMFastISel::TargetMaterializeAlloca(const AllocaInst *AI) { // Don't handle dynamic allocas. if (!FuncInfo.StaticAllocaMap.count(AI)) return 0; @@ -1115,7 +1114,7 @@ bool ARMFastISel::ARMEmitStore(EVT VT, unsigned SrcReg, Address &Addr) { // Create the base instruction, then add the operands. MachineInstrBuilder MIB = BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(StrOpc)) - .addReg(SrcReg, getKillRegState(true)); + .addReg(SrcReg); AddLoadStoreOperands(VT, Addr, MIB, MachineMemOperand::MOStore, useAM3); return true; } @@ -1304,6 +1303,8 @@ bool ARMFastISel::ARMEmitCmp(const Value *Src1Value, const Value *Src2Value, int Imm = 0; bool UseImm = false; bool isNegativeImm = false; + // FIXME: At -O0 we don't have anything that canonicalizes operand order. + // Thus, Src1Value may be a ConstantInt, but we're missing it. 
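An illustrative reading of that FIXME: at -O0 nothing reorders 'icmp eq i32 7, %x', so the constant arrives as Src1Value, the dyn_cast on Src2Value below misses it, and the comparison falls back to the register-register form; with optimization enabled, InstCombine would already have moved the constant to the right-hand side.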
if (const ConstantInt *ConstInt = dyn_cast<ConstantInt>(Src2Value)) { if (SrcVT == MVT::i32 || SrcVT == MVT::i16 || SrcVT == MVT::i8 || SrcVT == MVT::i1) { @@ -1669,12 +1670,6 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) { if (isFloat && !Subtarget->hasVFP2()) return false; - unsigned Op1 = getRegForValue(I->getOperand(0)); - if (Op1 == 0) return false; - - unsigned Op2 = getRegForValue(I->getOperand(1)); - if (Op2 == 0) return false; - unsigned Opc; bool is64bit = VT == MVT::f64 || VT == MVT::i64; switch (ISDOpcode) { @@ -1689,6 +1684,12 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) { Opc = is64bit ? ARM::VMULD : ARM::VMULS; break; } + unsigned Op1 = getRegForValue(I->getOperand(0)); + if (Op1 == 0) return false; + + unsigned Op2 = getRegForValue(I->getOperand(1)); + if (Op2 == 0) return false; + unsigned ResultReg = createResultReg(TLI.getRegClassFor(VT)); AddOptionalDefs(BuildMI(*FuncInfo.MBB, FuncInfo.InsertPt, DL, TII.get(Opc), ResultReg) @@ -1699,18 +1700,6 @@ bool ARMFastISel::SelectBinaryOp(const Instruction *I, unsigned ISDOpcode) { // Call Handling Code -bool ARMFastISel::FastEmitExtend(ISD::NodeType Opc, EVT DstVT, unsigned Src, - EVT SrcVT, unsigned &ResultReg) { - unsigned RR = FastEmit_r(SrcVT.getSimpleVT(), DstVT.getSimpleVT(), Opc, - Src, /*TODO: Kill=*/false); - - if (RR != 0) { - ResultReg = RR; - return true; - } else - return false; -} - // This is largely taken directly from CCAssignFnForNode - we don't support // varargs in FastISel so that part has been removed. // TODO: We may not support all of this. @@ -2119,9 +2108,6 @@ bool ARMFastISel::SelectCall(const Instruction *I, if (IntrMemName && e-i <= 2) break; - unsigned Arg = getRegForValue(*i); - if (Arg == 0) - return false; ISD::ArgFlagsTy Flags; unsigned AttrInd = i - CS.arg_begin() + 1; if (CS.paramHasAttr(AttrInd, Attribute::SExt)) @@ -2141,6 +2127,11 @@ bool ARMFastISel::SelectCall(const Instruction *I, if (!isTypeLegal(ArgTy, ArgVT) && ArgVT != MVT::i16 && ArgVT != MVT::i8 && ArgVT != MVT::i1) return false; + + unsigned Arg = getRegForValue(*i); + if (Arg == 0) + return false; + unsigned OriginalAlignment = TD.getABITypeAlignment(ArgTy); Flags.setOrigAlign(OriginalAlignment); diff --git a/lib/Target/ARM/ARMISelLowering.cpp b/lib/Target/ARM/ARMISelLowering.cpp index b55ef70..8c4c06f 100644 --- a/lib/Target/ARM/ARMISelLowering.cpp +++ b/lib/Target/ARM/ARMISelLowering.cpp @@ -40,7 +40,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/MC/MCSectionMachO.h" #include "llvm/Target/TargetOptions.h" @@ -687,7 +686,6 @@ ARMTargetLowering::ARMTargetLowering(TargetMachine &TM) if (Subtarget->isTargetDarwin()) { setOperationAction(ISD::EH_SJLJ_SETJMP, MVT::i32, Custom); setOperationAction(ISD::EH_SJLJ_LONGJMP, MVT::Other, Custom); - setOperationAction(ISD::EH_SJLJ_DISPATCHSETUP, MVT::Other, Custom); setLibcallName(RTLIB::UNWIND_RESUME, "_Unwind_SjLj_Resume"); } @@ -864,7 +862,6 @@ const char *ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::EH_SJLJ_SETJMP: return "ARMISD::EH_SJLJ_SETJMP"; case ARMISD::EH_SJLJ_LONGJMP:return "ARMISD::EH_SJLJ_LONGJMP"; - case ARMISD::EH_SJLJ_DISPATCHSETUP:return "ARMISD::EH_SJLJ_DISPATCHSETUP"; case ARMISD::TC_RETURN: return "ARMISD::TC_RETURN"; @@ -912,6 +909,7 @@ const char 
*ARMTargetLowering::getTargetNodeName(unsigned Opcode) const { case ARMISD::VGETLANEs: return "ARMISD::VGETLANEs"; case ARMISD::VMOVIMM: return "ARMISD::VMOVIMM"; case ARMISD::VMVNIMM: return "ARMISD::VMVNIMM"; + case ARMISD::VMOVFPIMM: return "ARMISD::VMOVFPIMM"; case ARMISD::VDUP: return "ARMISD::VDUP"; case ARMISD::VDUPLANE: return "ARMISD::VDUPLANE"; case ARMISD::VEXT: return "ARMISD::VEXT"; @@ -2212,14 +2210,6 @@ SDValue ARMTargetLowering::LowerGLOBAL_OFFSET_TABLE(SDValue Op, } SDValue -ARMTargetLowering::LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) - const { - DebugLoc dl = Op.getDebugLoc(); - return DAG.getNode(ARMISD::EH_SJLJ_DISPATCHSETUP, dl, MVT::Other, - Op.getOperand(0), Op.getOperand(1)); -} - -SDValue ARMTargetLowering::LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); SDValue Val = DAG.getConstant(0, MVT::i32); @@ -3986,6 +3976,16 @@ SDValue ARMTargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG, SDValue Vmov = DAG.getNode(ARMISD::VMVNIMM, dl, VmovVT, Val); return DAG.getNode(ISD::BITCAST, dl, VT, Vmov); } + + // Use vmov.f32 to materialize other v2f32 and v4f32 splats. + if (VT == MVT::v2f32 || VT == MVT::v4f32) { + ConstantFPSDNode *C = cast<ConstantFPSDNode>(Op.getOperand(0)); + int ImmVal = ARM_AM::getFP32Imm(C->getValueAPF()); + if (ImmVal != -1) { + SDValue Val = DAG.getTargetConstant(ImmVal, MVT::i32); + return DAG.getNode(ARMISD::VMOVFPIMM, dl, VT, Val); + } + } } } @@ -5014,7 +5014,6 @@ SDValue ARMTargetLowering::LowerOperation(SDValue Op, SelectionDAG &DAG) const { case ISD::GLOBAL_OFFSET_TABLE: return LowerGLOBAL_OFFSET_TABLE(Op, DAG); case ISD::EH_SJLJ_SETJMP: return LowerEH_SJLJ_SETJMP(Op, DAG); case ISD::EH_SJLJ_LONGJMP: return LowerEH_SJLJ_LONGJMP(Op, DAG); - case ISD::EH_SJLJ_DISPATCHSETUP: return LowerEH_SJLJ_DISPATCHSETUP(Op, DAG); case ISD::INTRINSIC_WO_CHAIN: return LowerINTRINSIC_WO_CHAIN(Op, DAG, Subtarget); case ISD::BITCAST: return ExpandBITCAST(Op.getNode(), DAG); @@ -5556,52 +5555,6 @@ ARMTargetLowering::EmitAtomicBinary64(MachineInstr *MI, MachineBasicBlock *BB, return BB; } -/// EmitBasePointerRecalculation - For functions using a base pointer, we -/// rematerialize it (via the frame pointer). -void ARMTargetLowering:: -EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB, - MachineBasicBlock *DispatchBB) const { - const TargetInstrInfo *TII = getTargetMachine().getInstrInfo(); - const ARMBaseInstrInfo *AII = static_cast<const ARMBaseInstrInfo*>(TII); - MachineFunction &MF = *MI->getParent()->getParent(); - ARMFunctionInfo *AFI = MF.getInfo<ARMFunctionInfo>(); - const ARMBaseRegisterInfo &RI = AII->getRegisterInfo(); - - if (!RI.hasBasePointer(MF)) return; - - MachineBasicBlock::iterator MBBI = MI; - - int32_t NumBytes = AFI->getFramePtrSpillOffset(); - unsigned FramePtr = RI.getFrameRegister(MF); - assert(MF.getTarget().getFrameLowering()->hasFP(MF) && - "Base pointer without frame pointer?"); - - if (AFI->isThumb2Function()) - llvm::emitT2RegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6, - FramePtr, -NumBytes, ARMCC::AL, 0, *AII); - else if (AFI->isThumbFunction()) - llvm::emitThumbRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6, - FramePtr, -NumBytes, *AII, RI); - else - llvm::emitARMRegPlusImmediate(*MBB, MBBI, MI->getDebugLoc(), ARM::R6, - FramePtr, -NumBytes, ARMCC::AL, 0, *AII); - - if (!RI.needsStackRealignment(MF)) return; - - // If there's dynamic realignment, adjust for it. 
- MachineFrameInfo *MFI = MF.getFrameInfo(); - unsigned MaxAlign = MFI->getMaxAlignment(); - assert(!AFI->isThumb1OnlyFunction()); - - // Emit bic r6, r6, MaxAlign - unsigned bicOpc = AFI->isThumbFunction() ? ARM::t2BICri : ARM::BICri; - AddDefaultCC( - AddDefaultPred( - BuildMI(*MBB, MBBI, MI->getDebugLoc(), TII->get(bicOpc), ARM::R6) - .addReg(ARM::R6, RegState::Kill) - .addImm(MaxAlign - 1))); -} - /// SetupEntryBlockForSjLj - Insert code into the entry block that creates and /// registers the function context. void ARMTargetLowering:: @@ -5636,8 +5589,6 @@ SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, MF->getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOStore, 4, 4); - EmitBasePointerRecalculation(MI, MBB, DispatchBB); - // Load the address of the dispatch MBB into the jump buffer. if (isThumb2) { // Incoming value: jbuf @@ -5811,6 +5762,8 @@ EmitSjLjDispatchBlock(MachineInstr *MI, MachineBasicBlock *MBB) const { MachineMemOperand::MOLoad | MachineMemOperand::MOVolatile, 4, 4); + BuildMI(DispatchBB, dl, TII->get(ARM::eh_sjlj_dispatchsetup)); + unsigned NumLPads = LPadList.size(); if (Subtarget->isThumb2()) { unsigned NewVReg1 = MRI->createVirtualRegister(TRC); diff --git a/lib/Target/ARM/ARMISelLowering.h b/lib/Target/ARM/ARMISelLowering.h index be6a530..b8dc4bf 100644 --- a/lib/Target/ARM/ARMISelLowering.h +++ b/lib/Target/ARM/ARMISelLowering.h @@ -81,7 +81,6 @@ namespace llvm { EH_SJLJ_SETJMP, // SjLj exception handling setjmp. EH_SJLJ_LONGJMP, // SjLj exception handling longjmp. - EH_SJLJ_DISPATCHSETUP, // SjLj exception handling dispatch setup. TC_RETURN, // Tail call return pseudo. @@ -146,6 +145,9 @@ namespace llvm { VMOVIMM, VMVNIMM, + // Vector move f32 immediate: + VMOVFPIMM, + // Vector duplicate: VDUP, VDUPLANE, @@ -407,7 +409,6 @@ namespace llvm { ISD::ArgFlagsTy Flags) const; SDValue LowerEH_SJLJ_SETJMP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerEH_SJLJ_LONGJMP(SDValue Op, SelectionDAG &DAG) const; - SDValue LowerEH_SJLJ_DISPATCHSETUP(SDValue Op, SelectionDAG &DAG) const; SDValue LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG, const ARMSubtarget *Subtarget) const; SDValue LowerBlockAddress(SDValue Op, SelectionDAG &DAG) const; @@ -517,9 +518,6 @@ namespace llvm { bool signExtend, ARMCC::CondCodes Cond) const; - void EmitBasePointerRecalculation(MachineInstr *MI, MachineBasicBlock *MBB, - MachineBasicBlock *DispatchBB) const; - void SetupEntryBlockForSjLj(MachineInstr *MI, MachineBasicBlock *MBB, MachineBasicBlock *DispatchBB, int FI) const; diff --git a/lib/Target/ARM/ARMInstrFormats.td b/lib/Target/ARM/ARMInstrFormats.td index 06ee2c8..6940156 100644 --- a/lib/Target/ARM/ARMInstrFormats.td +++ b/lib/Target/ARM/ARMInstrFormats.td @@ -2041,9 +2041,26 @@ multiclass VFPDT64InstAlias<string opc, string asm, dag Result> { def _64 : VFPDataTypeInstAlias<opc, ".64", asm, Result>; defm : VFPDT64ReqInstAlias<opc, asm, Result>; } +multiclass VFPDT64NoF64ReqInstAlias<string opc, string asm, dag Result> { + def I64 : VFPDataTypeInstAlias<opc, ".i64", asm, Result>; + def S64 : VFPDataTypeInstAlias<opc, ".s64", asm, Result>; + def U64 : VFPDataTypeInstAlias<opc, ".u64", asm, Result>; + def D : VFPDataTypeInstAlias<opc, ".d", asm, Result>; +} +// VFPDT64NoF64ReqInstAlias plus plain ".64" +multiclass VFPDT64NoF64InstAlias<string opc, string asm, dag Result> { + def _64 : VFPDataTypeInstAlias<opc, ".64", asm, Result>; + defm : VFPDT64NoF64ReqInstAlias<opc, asm, Result>; +} multiclass VFPDTAnyInstAlias<string opc, string asm, dag
Result> { defm : VFPDT8InstAlias<opc, asm, Result>; defm : VFPDT16InstAlias<opc, asm, Result>; defm : VFPDT32InstAlias<opc, asm, Result>; defm : VFPDT64InstAlias<opc, asm, Result>; } +multiclass VFPDTAnyNoF64InstAlias<string opc, string asm, dag Result> { + defm : VFPDT8InstAlias<opc, asm, Result>; + defm : VFPDT16InstAlias<opc, asm, Result>; + defm : VFPDT32InstAlias<opc, asm, Result>; + defm : VFPDT64NoF64InstAlias<opc, asm, Result>; +} diff --git a/lib/Target/ARM/ARMInstrInfo.td b/lib/Target/ARM/ARMInstrInfo.td index 770703c..be03924 100644 --- a/lib/Target/ARM/ARMInstrInfo.td +++ b/lib/Target/ARM/ARMInstrInfo.td @@ -58,8 +58,6 @@ def SDT_ARMEH_SJLJ_Setjmp : SDTypeProfile<1, 2, [SDTCisInt<0>, SDTCisPtrTy<1>, SDTCisInt<2>]>; def SDT_ARMEH_SJLJ_Longjmp: SDTypeProfile<0, 2, [SDTCisPtrTy<0>, SDTCisInt<1>]>; -def SDT_ARMEH_SJLJ_DispatchSetup: SDTypeProfile<0, 1, [SDTCisInt<0>]>; - def SDT_ARMMEMBARRIER : SDTypeProfile<0, 1, [SDTCisInt<0>]>; def SDT_ARMPREFETCH : SDTypeProfile<0, 3, [SDTCisPtrTy<0>, SDTCisSameAs<1, 2>, @@ -143,9 +141,6 @@ def ARMeh_sjlj_setjmp: SDNode<"ARMISD::EH_SJLJ_SETJMP", SDT_ARMEH_SJLJ_Setjmp, [SDNPHasChain]>; def ARMeh_sjlj_longjmp: SDNode<"ARMISD::EH_SJLJ_LONGJMP", SDT_ARMEH_SJLJ_Longjmp, [SDNPHasChain]>; -def ARMeh_sjlj_dispatchsetup: SDNode<"ARMISD::EH_SJLJ_DISPATCHSETUP", - SDT_ARMEH_SJLJ_DispatchSetup, [SDNPHasChain]>; - def ARMMemBarrier : SDNode<"ARMISD::MEMBARRIER", SDT_ARMMEMBARRIER, [SDNPHasChain]>; @@ -475,6 +470,7 @@ def shift_so_reg_reg : Operand<i32>, // reg reg imm let EncoderMethod = "getSORegRegOpValue"; let PrintMethod = "printSORegRegOperand"; let DecoderMethod = "DecodeSORegRegOperand"; + let ParserMatchClass = ShiftedRegAsmOperand; let MIOperandInfo = (ops GPR, GPR, i32imm); } @@ -485,6 +481,7 @@ def shift_so_reg_imm : Operand<i32>, // reg reg imm let EncoderMethod = "getSORegImmOpValue"; let PrintMethod = "printSORegImmOperand"; let DecoderMethod = "DecodeSORegImmOperand"; + let ParserMatchClass = ShiftedImmAsmOperand; let MIOperandInfo = (ops GPR, i32imm); } @@ -1555,7 +1552,7 @@ PseudoInst<(outs), (ins i32imm:$amt, pred:$p), NoItinerary, } // Atomic pseudo-insts which will be lowered to ldrexd/strexd loops. -// (These psuedos use a hand-written selection code). +// (These pseudos use a hand-written selection code). let usesCustomInserter = 1, Defs = [CPSR], mayLoad = 1, mayStore = 1 in { def ATOMOR6432 : PseudoInst<(outs GPR:$dst1, GPR:$dst2), (ins GPR:$addr, GPR:$src1, GPR:$src2), @@ -4673,11 +4670,8 @@ def Int_eh_sjlj_longjmp : PseudoInst<(outs), (ins GPR:$src, GPR:$scratch), // This pseudo is used for ARM, Thumb1 and Thumb2. Any differences are // handled when the pseudo is expanded (which happens before any passes // that need the instruction size). 
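The change below completes the dispatchsetup simplification visible in the ARMISelLowering hunks above: the pseudo loses its GPR operand, its selection pattern, and its Darwin-only predicate, because it is no longer selected from an EH_SJLJ_DISPATCHSETUP node but is built directly into the dispatch block by EmitSjLjDispatchBlock.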
-let isBarrier = 1, hasSideEffects = 1 in -def Int_eh_sjlj_dispatchsetup : - PseudoInst<(outs), (ins GPR:$src), NoItinerary, - [(ARMeh_sjlj_dispatchsetup GPR:$src)]>, - Requires<[IsDarwin]>; +let isBarrier = 1 in +def eh_sjlj_dispatchsetup : PseudoInst<(outs), (ins), NoItinerary, []>; //===----------------------------------------------------------------------===// // Non-Instruction Patterns @@ -5023,3 +5017,43 @@ def LSLi : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rm, $imm", def RORi : ARMAsmPseudo<"ror${s}${p} $Rd, $Rm, $imm", (ins GPR:$Rd, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; +def RRXi : ARMAsmPseudo<"rrx${s}${p} $Rd, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rm, pred:$p, cc_out:$s)>; +def ASRr : ARMAsmPseudo<"asr${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def LSRr : ARMAsmPseudo<"lsr${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def LSLr : ARMAsmPseudo<"lsl${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def RORr : ARMAsmPseudo<"ror${s}${p} $Rd, $Rn, $Rm", + (ins GPRnopc:$Rd, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +// shifter instructions also support a two-operand form. +def : ARMInstAlias<"asr${s}${p} $Rm, $imm", + (ASRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"lsr${s}${p} $Rm, $imm", + (LSRi GPR:$Rm, GPR:$Rm, imm0_32:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"lsl${s}${p} $Rm, $imm", + (LSLi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"ror${s}${p} $Rm, $imm", + (RORi GPR:$Rm, GPR:$Rm, imm0_31:$imm, pred:$p, cc_out:$s)>; +def : ARMInstAlias<"asr${s}${p} $Rn, $Rm", + (ASRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def : ARMInstAlias<"lsr${s}${p} $Rn, $Rm", + (LSRr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def : ARMInstAlias<"lsl${s}${p} $Rn, $Rm", + (LSLr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; +def : ARMInstAlias<"ror${s}${p} $Rn, $Rm", + (RORr GPRnopc:$Rn, GPRnopc:$Rn, GPRnopc:$Rm, pred:$p, + cc_out:$s)>; + + +// 'mul' instruction can be specified with only two operands. 
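To illustrate the aliases added next, the assembler now also accepts two-operand spellings that implicitly reuse the destination (examples, not from the patch):

    asr r1, #2      @ same as: asr r1, r1, #2
    lsl r0, r3      @ same as: lsl r0, r0, r3
    mul r2, r5      @ same as: mul r2, r2, r5

each alias expanding to the three-operand form with the first register repeated.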
+def : ARMInstAlias<"mul${s}${p} $Rn, $Rm", + (MUL rGPR:$Rn, rGPR:$Rn, rGPR:$Rm, pred:$p, cc_out:$s)>; diff --git a/lib/Target/ARM/ARMInstrNEON.td b/lib/Target/ARM/ARMInstrNEON.td index 49cc254..f2ca963 100644 --- a/lib/Target/ARM/ARMInstrNEON.td +++ b/lib/Target/ARM/ARMInstrNEON.td @@ -39,6 +39,10 @@ def nImmVMOVI32 : Operand<i32> { let PrintMethod = "printNEONModImmOperand"; let ParserMatchClass = nImmVMOVI32AsmOperand; } +def nImmVMOVF32 : Operand<i32> { + let PrintMethod = "printFPImmOperand"; + let ParserMatchClass = FPImmOperand; +} def nImmSplatI64AsmOperand : AsmOperandClass { let Name = "NEONi64splat"; } def nImmSplatI64 : Operand<i32> { let PrintMethod = "printNEONModImmOperand"; @@ -173,6 +177,7 @@ def NEONvgetlanes : SDNode<"ARMISD::VGETLANEs", SDTARMVGETLN>; def SDTARMVMOVIMM : SDTypeProfile<1, 1, [SDTCisVec<0>, SDTCisVT<1, i32>]>; def NEONvmovImm : SDNode<"ARMISD::VMOVIMM", SDTARMVMOVIMM>; def NEONvmvnImm : SDNode<"ARMISD::VMVNIMM", SDTARMVMOVIMM>; +def NEONvmovFPImm : SDNode<"ARMISD::VMOVFPIMM", SDTARMVMOVIMM>; def SDTARMVORRIMM : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisVT<2, i32>]>; @@ -4464,6 +4469,10 @@ def : InstAlias<"vmov${p} $Vd, $Vm", (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; def : InstAlias<"vmov${p} $Vd, $Vm", (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; +defm : VFPDTAnyNoF64InstAlias<"vmov${p}", "$Vd, $Vm", + (VORRd DPR:$Vd, DPR:$Vm, DPR:$Vm, pred:$p)>; +defm : VFPDTAnyNoF64InstAlias<"vmov${p}", "$Vd, $Vm", + (VORRq QPR:$Vd, QPR:$Vm, QPR:$Vm, pred:$p)>; // VMOV : Vector Move (Immediate) @@ -4513,6 +4522,15 @@ def VMOVv2i64 : N1ModImm<1, 0b000, 0b1110, 0, 1, 1, 1, (outs QPR:$Vd), (ins nImmSplatI64:$SIMM), IIC_VMOVImm, "vmov", "i64", "$Vd, $SIMM", "", [(set QPR:$Vd, (v2i64 (NEONvmovImm timm:$SIMM)))]>; + +def VMOVv2f32 : N1ModImm<1, 0b000, 0b1111, 0, 0, 0, 1, (outs DPR:$Vd), + (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, + "vmov", "f32", "$Vd, $SIMM", "", + [(set DPR:$Vd, (v2f32 (NEONvmovFPImm timm:$SIMM)))]>; +def VMOVv4f32 : N1ModImm<1, 0b000, 0b1111, 0, 1, 0, 1, (outs QPR:$Vd), + (ins nImmVMOVF32:$SIMM), IIC_VMOVImm, + "vmov", "f32", "$Vd, $SIMM", "", + [(set QPR:$Vd, (v4f32 (NEONvmovFPImm timm:$SIMM)))]>; } // isReMaterializable // VMOV : Vector Get Lane (move scalar to ARM core register) @@ -4801,6 +4819,7 @@ def VCVTu2fq : N2VQ<0b11, 0b11, 0b10, 0b11, 0b01101, 0, "vcvt", "f32.u32", v4f32, v4i32, uint_to_fp>; // VCVT : Vector Convert Between Floating-Point and Fixed-Point. +let DecoderMethod = "DecodeVCVTD" in { def VCVTf2xsd : N2VCvtD<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", v2i32, v2f32, int_arm_neon_vcvtfp2fxs>; def VCVTf2xud : N2VCvtD<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", @@ -4809,7 +4828,9 @@ def VCVTxs2fd : N2VCvtD<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v2f32, v2i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fd : N2VCvtD<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v2f32, v2i32, int_arm_neon_vcvtfxu2fp>; +} +let DecoderMethod = "DecodeVCVTQ" in { def VCVTf2xsq : N2VCvtQ<0, 1, 0b1111, 0, 1, "vcvt", "s32.f32", v4i32, v4f32, int_arm_neon_vcvtfp2fxs>; def VCVTf2xuq : N2VCvtQ<1, 1, 0b1111, 0, 1, "vcvt", "u32.f32", @@ -4818,6 +4839,7 @@ def VCVTxs2fq : N2VCvtQ<0, 1, 0b1110, 0, 1, "vcvt", "f32.s32", v4f32, v4i32, int_arm_neon_vcvtfxs2fp>; def VCVTxu2fq : N2VCvtQ<1, 1, 0b1110, 0, 1, "vcvt", "f32.u32", v4f32, v4i32, int_arm_neon_vcvtfxu2fp>; +} // VCVT : Vector Convert Between Half-Precision and Single-Precision. 
def VCVTf2h : N2VNInt<0b11, 0b11, 0b01, 0b10, 0b01100, 0, 0, @@ -5218,6 +5240,19 @@ defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", (VLD1d32wb_fixed VecListOneD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", (VLD1d64wb_fixed VecListOneD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; +// with writeback, register stride +defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d8wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d16wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d32wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; +defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d64wb_register VecListOneD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; // Load two D registers. defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn", @@ -5237,6 +5272,19 @@ defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", (VLD1q32wb_fixed VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", (VLD1q64wb_fixed VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; +// with writeback, register stride +defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1q8wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1q16wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1q32wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; +defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1q64wb_register VecListTwoD:$Vd, zero_reg, addrmode6:$Rn, + rGPR:$Rm, pred:$p)>; // Load three D registers. defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn", @@ -5260,6 +5308,19 @@ defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", (VLD1d64Twb_fixed VecListThreeD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; +// with writeback, register stride +defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d8Twb_register VecListThreeD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d16Twb_register VecListThreeD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d32Twb_register VecListThreeD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; +defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d64Twb_register VecListThreeD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; // Load four D registers. 
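These 'with writeback, register stride' aliases (extended to the four-register forms in the next hunk) let the type-suffixed vld1 syntax use post-increment by a register, e.g. (illustrative) 'vld1.8 {d0, d1, d2}, [r0], r4', which loads three D registers and then advances r0 by r4.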
@@ -5284,6 +5345,19 @@ defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn!", defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn!", (VLD1d64Qwb_fixed VecListFourD:$Vd, zero_reg, addrmode6:$Rn, pred:$p)>; +// with writeback, register stride +defm : VFPDT8ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d8Qwb_register VecListFourD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d16Qwb_register VecListFourD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d32Qwb_register VecListFourD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; +defm : VFPDT64ReqInstAlias<"vld1${p}", "$Vd, $Rn, $Rm", + (VLD1d64Qwb_register VecListFourD:$Vd, zero_reg, + addrmode6:$Rn, rGPR:$Rm, pred:$p)>; // VST1 requires a size suffix, but also accepts type specific variants. // Store one D register. @@ -5304,6 +5378,19 @@ defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn!", (VST1d32wb_fixed zero_reg, addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn!", (VST1d64wb_fixed zero_reg, addrmode6:$Rn, VecListOneD:$Vd, pred:$p)>; +// with writeback, register stride +defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1d8wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, + VecListOneD:$Vd, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1d16wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, + VecListOneD:$Vd, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1d32wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, + VecListOneD:$Vd, pred:$p)>; +defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1d64wb_register zero_reg, addrmode6:$Rn, rGPR:$Rm, + VecListOneD:$Vd, pred:$p)>; // Store two D registers. defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn", @@ -5323,6 +5410,19 @@ defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn!", (VST1q32wb_fixed zero_reg, addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn!", (VST1q64wb_fixed zero_reg, addrmode6:$Rn, VecListTwoD:$Vd, pred:$p)>; +// with writeback, register stride +defm : VFPDT8ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1q8wb_register zero_reg, addrmode6:$Rn, + rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1q16wb_register zero_reg, addrmode6:$Rn, + rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1q32wb_register zero_reg, addrmode6:$Rn, + rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; +defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn, $Rm", + (VST1q64wb_register zero_reg, addrmode6:$Rn, + rGPR:$Rm, VecListTwoD:$Vd, pred:$p)>; // FIXME: The three and four register VST1 instructions haven't been moved // to the VecList* encoding yet, so we can't do assembly parsing support @@ -5346,3 +5446,19 @@ defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn!", // (VST1d32Q addrmode6:$Rn, VecListFourD:$Vd, pred:$p)>; //defm : VFPDT64ReqInstAlias<"vst1${p}", "$Vd, $Rn", // (VST1d64Q addrmode6:$Rn, VecListFourD:$Vd, pred:$p)>; + + +// VTRN instructions data type suffix aliases for more-specific types. 
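As with the vld1/vst1 aliases above, the VTRN aliases below accept any data type suffix of the matching element width, so (illustrative) 'vtrn.s16 d0, d1' and 'vtrn.u16 d0, d1' assemble exactly like 'vtrn.16 d0, d1'.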
+defm : VFPDT8ReqInstAlias <"vtrn${p}", "$Dd, $Dm", + (VTRNd8 DPR:$Dd, DPR:$Dm, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vtrn${p}", "$Dd, $Dm", + (VTRNd16 DPR:$Dd, DPR:$Dm, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vtrn${p}", "$Dd, $Dm", + (VTRNd32 DPR:$Dd, DPR:$Dm, pred:$p)>; + +defm : VFPDT8ReqInstAlias <"vtrn${p}", "$Qd, $Qm", + (VTRNq8 QPR:$Qd, QPR:$Qm, pred:$p)>; +defm : VFPDT16ReqInstAlias<"vtrn${p}", "$Qd, $Qm", + (VTRNq16 QPR:$Qd, QPR:$Qm, pred:$p)>; +defm : VFPDT32ReqInstAlias<"vtrn${p}", "$Qd, $Qm", + (VTRNq32 QPR:$Qd, QPR:$Qm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrThumb2.td b/lib/Target/ARM/ARMInstrThumb2.td index 03077c0..6129fa3 100644 --- a/lib/Target/ARM/ARMInstrThumb2.td +++ b/lib/Target/ARM/ARMInstrThumb2.td @@ -4084,3 +4084,8 @@ def : t2InstAlias<"sxth${p} $Rd, $Rm$rot", // for isel. def : t2InstAlias<"mov${p} $Rd, $imm", (t2MVNi rGPR:$Rd, t2_so_imm_not:$imm, pred:$p, zero_reg)>; + + +// Wide 'mul' encoding can be specified with only two operands. +def : t2InstAlias<"mul${p} $Rn, $Rm", + (t2MUL rGPR:$Rn, rGPR:$Rn, rGPR:$Rm, pred:$p)>; diff --git a/lib/Target/ARM/ARMInstrVFP.td b/lib/Target/ARM/ARMInstrVFP.td index 488c508..e420135 100644 --- a/lib/Target/ARM/ARMInstrVFP.td +++ b/lib/Target/ARM/ARMInstrVFP.td @@ -1172,3 +1172,35 @@ defm : VFPDT64InstAlias<"vldr${p}", "$Dd, $addr", (VLDRD DPR:$Dd, addrmode5:$addr, pred:$p)>; defm : VFPDT64InstAlias<"vstr${p}", "$Dd, $addr", (VSTRD DPR:$Dd, addrmode5:$addr, pred:$p)>; + +// VMUL has a two-operand form (implied destination operand) +def : VFP2InstAlias<"vmul${p}.f64 $Dn, $Dm", + (VMULD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; +def : VFP2InstAlias<"vmul${p}.f32 $Sn, $Sm", + (VMULS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; +// VADD has a two-operand form (implied destination operand) +def : VFP2InstAlias<"vadd${p}.f64 $Dn, $Dm", + (VADDD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; +def : VFP2InstAlias<"vadd${p}.f32 $Sn, $Sm", + (VADDS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; +// VSUB has a two-operand form (implied destination operand) +def : VFP2InstAlias<"vsub${p}.f64 $Dn, $Dm", + (VSUBD DPR:$Dn, DPR:$Dn, DPR:$Dm, pred:$p)>; +def : VFP2InstAlias<"vsub${p}.f32 $Sn, $Sm", + (VSUBS SPR:$Sn, SPR:$Sn, SPR:$Sm, pred:$p)>; + +// VMOV can accept optional .f32/.f64 suffix. +def : VFP2InstAlias<"vmov${p}.f32 $Rt, $Sn", + (VMOVRS GPR:$Rt, SPR:$Sn, pred:$p)>; +def : VFP2InstAlias<"vmov${p}.f32 $Sn, $Rt", + (VMOVSR SPR:$Sn, GPR:$Rt, pred:$p)>; + +def : VFP2InstAlias<"vmov${p}.f64 $Rt, $Rt2, $Dn", + (VMOVRRD GPR:$Rt, GPR:$Rt2, DPR:$Dn, pred:$p)>; +def : VFP2InstAlias<"vmov${p}.f64 $Dn, $Rt, $Rt2", + (VMOVDRR DPR:$Dn, GPR:$Rt, GPR:$Rt2, pred:$p)>; + +// VMOVS doesn't need the .f32 to disambiguate from the NEON encoding the way +// VMOVD does. 
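Concretely: a plain 'vmov d0, d1' is taken as the NEON VORR-based alias above, and only the .f64 suffix selects the VFP VMOVD encoding; S registers exist only in VFP, so 'vmov s0, s1' needs no suffix and can map straight to VMOVS, which is what the alias below does.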
+def : VFP2InstAlias<"vmov${p} $Sd, $Sm", + (VMOVS SPR:$Sd, SPR:$Sm, pred:$p)>; diff --git a/lib/Target/ARM/ARMTargetMachine.cpp b/lib/Target/ARM/ARMTargetMachine.cpp index cf1432d..6cbb24b 100644 --- a/lib/Target/ARM/ARMTargetMachine.cpp +++ b/lib/Target/ARM/ARMTargetMachine.cpp @@ -38,8 +38,9 @@ extern "C" void LLVMInitializeARMTarget() { /// ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS), JITInfo(), InstrItins(Subtarget.getInstrItineraryData()) { @@ -50,8 +51,9 @@ ARMBaseTargetMachine::ARMBaseTargetMachine(const Target &T, StringRef TT, ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM), InstrInfo(Subtarget), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM, OL), InstrInfo(Subtarget), DataLayout(Subtarget.isAPCS_ABI() ? std::string("e-p:32:32-f64:32:64-i64:32:64-" "v128:32:128-v64:32:64-n32-S32") : @@ -71,8 +73,9 @@ ARMTargetMachine::ARMTargetMachine(const Target &T, StringRef TT, ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : ARMBaseTargetMachine(T, TT, CPU, FS, RM, CM, OL), InstrInfo(Subtarget.hasThumb2() ? ((ARMBaseInstrInfo*)new Thumb2InstrInfo(Subtarget)) : ((ARMBaseInstrInfo*)new Thumb1InstrInfo(Subtarget))), @@ -95,34 +98,30 @@ ThumbTargetMachine::ThumbTargetMachine(const Target &T, StringRef TT, : (ARMFrameLowering*)new Thumb1FrameLowering(Subtarget)) { } -bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { - if (OptLevel != CodeGenOpt::None && EnableGlobalMerge) +bool ARMBaseTargetMachine::addPreISel(PassManagerBase &PM) { + if (getOptLevel() != CodeGenOpt::None && EnableGlobalMerge) PM.add(createGlobalMergePass(getTargetLowering())); return false; } -bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { - PM.add(createARMISelDag(*this, OptLevel)); +bool ARMBaseTargetMachine::addInstSelector(PassManagerBase &PM) { + PM.add(createARMISelDag(*this, getOptLevel())); return false; } -bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool ARMBaseTargetMachine::addPreRegAlloc(PassManagerBase &PM) { // FIXME: temporarily disabling load / store optimization pass for Thumb1. - if (OptLevel != CodeGenOpt::None && !Subtarget.isThumb1Only()) + if (getOptLevel() != CodeGenOpt::None && !Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass(true)); - if (OptLevel != CodeGenOpt::None && Subtarget.isCortexA9()) + if (getOptLevel() != CodeGenOpt::None && Subtarget.isCortexA9()) PM.add(createMLxExpansionPass()); return true; } -bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM) { // FIXME: temporarily disabling load / store optimization pass for Thumb1. 
- if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { if (!Subtarget.isThumb1Only()) PM.add(createARMLoadStoreOptimizationPass()); if (Subtarget.hasNEON()) @@ -133,7 +132,7 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, // proper scheduling. PM.add(createARMExpandPseudoPass()); - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { if (!Subtarget.isThumb1Only()) PM.add(createIfConverterPass()); } @@ -143,8 +142,7 @@ bool ARMBaseTargetMachine::addPreSched2(PassManagerBase &PM, return true; } -bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM) { if (Subtarget.isThumb2() && !Subtarget.prefers32BitThumb()) PM.add(createThumb2SizeReductionPass()); @@ -153,7 +151,6 @@ bool ARMBaseTargetMachine::addPreEmitPass(PassManagerBase &PM, } bool ARMBaseTargetMachine::addCodeEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { // Machine code emitter pass for ARM. PM.add(createARMJITCodeEmitterPass(*this, JCE)); diff --git a/lib/Target/ARM/ARMTargetMachine.h b/lib/Target/ARM/ARMTargetMachine.h index c8c601c..a1f517b 100644 --- a/lib/Target/ARM/ARMTargetMachine.h +++ b/lib/Target/ARM/ARMTargetMachine.h @@ -41,7 +41,8 @@ private: public: ARMBaseTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual ARMJITInfo *getJITInfo() { return &JITInfo; } virtual const ARMSubtarget *getSubtargetImpl() const { return &Subtarget; } @@ -50,13 +51,12 @@ public: } // Pass Pipeline Configuration - virtual bool addPreISel(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreSched2(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, - JITCodeEmitter &MCE); + virtual bool addPreISel(PassManagerBase &PM); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreRegAlloc(PassManagerBase &PM); + virtual bool addPreSched2(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); + virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &MCE); }; /// ARMTargetMachine - ARM target machine. 
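The common thread in these TargetMachine hunks: the optimization level now lives on the target machine itself (the new CodeGenOpt::Level OL constructor parameter, forwarded to LLVMTargetMachine), so the pass-pipeline hooks (addPreISel, addInstSelector, addPreRegAlloc, addPreSched2, addPreEmitPass, addCodeEmitter) drop their per-call CodeGenOpt::Level argument and query getOptLevel() instead.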
@@ -71,7 +71,8 @@ class ARMTargetMachine : public ARMBaseTargetMachine { public: ARMTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const ARMRegisterInfo *getRegisterInfo() const { return &InstrInfo.getRegisterInfo(); @@ -111,7 +112,8 @@ class ThumbTargetMachine : public ARMBaseTargetMachine { public: ThumbTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); /// returns either Thumb1RegisterInfo or Thumb2RegisterInfo virtual const ARMBaseRegisterInfo *getRegisterInfo() const { diff --git a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 1d66d12..bb83e5e 100644 --- a/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -1946,18 +1946,15 @@ void ARMOperand::print(raw_ostream &OS) const { break; case k_ShiftedRegister: OS << "<so_reg_reg " - << RegShiftedReg.SrcReg - << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedReg.ShiftImm)) - << ", " << RegShiftedReg.ShiftReg << ", " - << ARM_AM::getSORegOffset(RegShiftedReg.ShiftImm) - << ">"; + << RegShiftedReg.SrcReg << " " + << ARM_AM::getShiftOpcStr(RegShiftedReg.ShiftTy) + << " " << RegShiftedReg.ShiftReg << ">"; break; case k_ShiftedImmediate: OS << "<so_reg_imm " - << RegShiftedImm.SrcReg - << ARM_AM::getShiftOpcStr(ARM_AM::getSORegShOp(RegShiftedImm.ShiftImm)) - << ", " << ARM_AM::getSORegOffset(RegShiftedImm.ShiftImm) - << ">"; + << RegShiftedImm.SrcReg << " " + << ARM_AM::getShiftOpcStr(RegShiftedImm.ShiftTy) + << " #" << RegShiftedImm.ShiftImm << ">"; break; case k_RotateImmediate: OS << "<ror " << " #" << (RotImm.Imm * 8) << ">"; @@ -2366,7 +2363,7 @@ static unsigned getDRegFromQReg(unsigned QReg) { case ARM::Q6: return ARM::D12; case ARM::Q7: return ARM::D14; case ARM::Q8: return ARM::D16; - case ARM::Q9: return ARM::D19; + case ARM::Q9: return ARM::D18; case ARM::Q10: return ARM::D20; case ARM::Q11: return ARM::D22; case ARM::Q12: return ARM::D24; @@ -2420,7 +2417,7 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { while (Parser.getTok().is(AsmToken::Comma) || Parser.getTok().is(AsmToken::Minus)) { if (Parser.getTok().is(AsmToken::Minus)) { - Parser.Lex(); // Eat the comma. + Parser.Lex(); // Eat the minus. SMLoc EndLoc = Parser.getTok().getLoc(); int EndReg = tryParseRegister(); if (EndReg == -1) @@ -2487,10 +2484,31 @@ parseRegisterList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { // parse a vector register list ARMAsmParser::OperandMatchResultTy ARMAsmParser:: parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { - if(Parser.getTok().isNot(AsmToken::LCurly)) + SMLoc S = Parser.getTok().getLoc(); + // As an extension (to match gas), support a plain D register or Q register + // (without enclosing curly braces) as a single or double entry list, + // respectively.
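So (illustrative) 'vld1.32 d0, [r0]' is now accepted as if written 'vld1.32 {d0}, [r0]', and a bare Q register is treated as its pair of D sub-registers via getDRegFromQReg, e.g. q2 stands for {d4, d5}; the code below implements exactly that.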
+ if (Parser.getTok().is(AsmToken::Identifier)) { + int Reg = tryParseRegister(); + if (Reg == -1) + return MatchOperand_NoMatch; + SMLoc E = Parser.getTok().getLoc(); + if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) { + Operands.push_back(ARMOperand::CreateVectorList(Reg, 1, S, E)); + return MatchOperand_Success; + } + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { + Reg = getDRegFromQReg(Reg); + Operands.push_back(ARMOperand::CreateVectorList(Reg, 2, S, E)); + return MatchOperand_Success; + } + Error(S, "vector register expected"); + return MatchOperand_ParseFail; + } + + if (Parser.getTok().isNot(AsmToken::LCurly)) return MatchOperand_NoMatch; - SMLoc S = Parser.getTok().getLoc(); Parser.Lex(); // Eat '{' token. SMLoc RegLoc = Parser.getTok().getLoc(); @@ -2509,7 +2527,39 @@ parseVectorList(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { ++Count; } - while (Parser.getTok().is(AsmToken::Comma)) { + while (Parser.getTok().is(AsmToken::Comma) || + Parser.getTok().is(AsmToken::Minus)) { + if (Parser.getTok().is(AsmToken::Minus)) { + Parser.Lex(); // Eat the minus. + SMLoc EndLoc = Parser.getTok().getLoc(); + int EndReg = tryParseRegister(); + if (EndReg == -1) { + Error(EndLoc, "register expected"); + return MatchOperand_ParseFail; + } + // Allow Q regs and just interpret them as the two D sub-registers. + if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(EndReg)) + EndReg = getDRegFromQReg(EndReg) + 1; + // If the register is the same as the start reg, there's nothing + // more to do. + if (Reg == EndReg) + continue; + // The register must be in the same register class as the first. + if (!ARMMCRegisterClasses[ARM::DPRRegClassID].contains(EndReg)) { + Error(EndLoc, "invalid register in register list"); + return MatchOperand_ParseFail; + } + // Ranges must go from low to high. + if (Reg > EndReg) { + Error(EndLoc, "bad range in register list"); + return MatchOperand_ParseFail; + } + + // Add all the registers in the range to the register list. + Count += EndReg - Reg; + Reg = EndReg; + continue; + } Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); int OldReg = Reg; @@ -3538,9 +3588,12 @@ parseMemory(SmallVectorImpl<MCParsedAsmOperand*> &Operands) { } // If we have a '#', it's an immediate offset, else assume it's a register - // offset. - if (Parser.getTok().is(AsmToken::Hash)) { - Parser.Lex(); // Eat the '#'. + // offset. Be friendly and also accept a plain integer (without a leading + // hash) for gas compatibility. + if (Parser.getTok().is(AsmToken::Hash) || + Parser.getTok().is(AsmToken::Integer)) { + if (Parser.getTok().is(AsmToken::Hash)) + Parser.Lex(); // Eat the '#'. E = Parser.getTok().getLoc(); bool isNegative = getParser().getTok().is(AsmToken::Minus); @@ -4098,6 +4151,7 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, // remove the cc_out operand. (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) || !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) || + !isARMLowRegister(static_cast<ARMOperand*>(Operands[5])->getReg()) || !inITBlock() || (static_cast<ARMOperand*>(Operands[3])->getReg() != static_cast<ARMOperand*>(Operands[5])->getReg() && @@ -4105,6 +4159,20 @@ bool ARMAsmParser::shouldOmitCCOutOperand(StringRef Mnemonic, static_cast<ARMOperand*>(Operands[4])->getReg()))) return true; + // Also check the 'mul' syntax variant that doesn't specify an explicit + // destination register. 
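For example, a plain 'mul r0, r1' outside an IT block: the 16-bit Thumb encoding always sets flags, so with no 's' suffix present (Operands[1] holds the zero register) the wide t2MUL encoding must be used, and the check below reports that the cc_out operand should be dropped.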
+ if (isThumbTwo() && Mnemonic == "mul" && Operands.size() == 5 && + static_cast<ARMOperand*>(Operands[1])->getReg() == 0 && + static_cast<ARMOperand*>(Operands[3])->isReg() && + static_cast<ARMOperand*>(Operands[4])->isReg() && + // If the registers aren't low regs or the cc_out operand is zero + // outside of an IT block, we have to use the 32-bit encoding, so + // remove the cc_out operand. + (!isARMLowRegister(static_cast<ARMOperand*>(Operands[3])->getReg()) || + !isARMLowRegister(static_cast<ARMOperand*>(Operands[4])->getReg()) || + !inITBlock())) + return true; + // Register-register 'add/sub' for thumb does not have a cc_out operand @@ -4542,12 +4610,37 @@ processInstruction(MCInst &Inst, const SmallVectorImpl<MCParsedAsmOperand*> &Operands) { switch (Inst.getOpcode()) { // Handle the MOV complex aliases. + case ARM::ASRr: + case ARM::LSRr: + case ARM::LSLr: + case ARM::RORr: { + ARM_AM::ShiftOpc ShiftTy; + switch(Inst.getOpcode()) { + default: llvm_unreachable("unexpected opcode!"); + case ARM::ASRr: ShiftTy = ARM_AM::asr; break; + case ARM::LSRr: ShiftTy = ARM_AM::lsr; break; + case ARM::LSLr: ShiftTy = ARM_AM::lsl; break; + case ARM::RORr: ShiftTy = ARM_AM::ror; break; + } + // A shift by zero is a plain MOVr, not a MOVsi. + unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, 0); + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVsr); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(Inst.getOperand(2)); // Rm + TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty + TmpInst.addOperand(Inst.getOperand(3)); // CondCode + TmpInst.addOperand(Inst.getOperand(4)); + TmpInst.addOperand(Inst.getOperand(5)); // cc_out + Inst = TmpInst; + return true; + } case ARM::ASRi: case ARM::LSRi: case ARM::LSLi: case ARM::RORi: { ARM_AM::ShiftOpc ShiftTy; - unsigned Amt = Inst.getOperand(2).getImm(); switch(Inst.getOpcode()) { default: llvm_unreachable("unexpected opcode!"); case ARM::ASRi: ShiftTy = ARM_AM::asr; break; @@ -4556,6 +4649,7 @@ processInstruction(MCInst &Inst, case ARM::RORi: ShiftTy = ARM_AM::ror; break; } // A shift by zero is a plain MOVr, not a MOVsi. + unsigned Amt = Inst.getOperand(2).getImm(); unsigned Opc = Amt == 0 ? ARM::MOVr : ARM::MOVsi; unsigned Shifter = ARM_AM::getSORegOpc(ShiftTy, Amt); MCInst TmpInst; @@ -4570,6 +4664,19 @@ processInstruction(MCInst &Inst, Inst = TmpInst; return true; } + case ARM::RRXi: { + unsigned Shifter = ARM_AM::getSORegOpc(ARM_AM::rrx, 0); + MCInst TmpInst; + TmpInst.setOpcode(ARM::MOVsi); + TmpInst.addOperand(Inst.getOperand(0)); // Rd + TmpInst.addOperand(Inst.getOperand(1)); // Rn + TmpInst.addOperand(MCOperand::CreateImm(Shifter)); // Shift value and ty + TmpInst.addOperand(Inst.getOperand(2)); // CondCode + TmpInst.addOperand(Inst.getOperand(3)); + TmpInst.addOperand(Inst.getOperand(4)); // cc_out + Inst = TmpInst; + return true; + } case ARM::t2LDMIA_UPD: { // If this is a load of a single register, then we should use // a post-indexed LDR instruction instead, per the ARM ARM. 
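Two of the rewrites above, restated as assembly (illustrative): a register-controlled shift such as 'asr r0, r1, r2' is turned into the MOVsr form 'mov r0, r1, asr r2', with the shift kind packed by ARM_AM::getSORegOpc; and, per the ARM ARM rule cited in the last comment, a single-register 'ldmia r1!, {r2}' is better emitted as the post-indexed load 'ldr r2, [r1], #4'.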
diff --git a/lib/Target/ARM/CMakeLists.txt b/lib/Target/ARM/CMakeLists.txt index baa55f2..511932e 100644 --- a/lib/Target/ARM/CMakeLists.txt +++ b/lib/Target/ARM/CMakeLists.txt @@ -62,8 +62,8 @@ add_llvm_library_dependencies(LLVMARMCodeGen LLVMTarget ) -# workaround for hanging compilation on MSVC10 -if( MSVC_VERSION EQUAL 1600 ) +# workaround for hanging compilation on MSVC9, 10 +if( MSVC_VERSION EQUAL 1600 OR MSVC_VERSION EQUAL 1500 ) set_property( SOURCE ARMISelLowering.cpp PROPERTY COMPILE_FLAGS "/Od" diff --git a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 0b9b5d0..ad250ab 100644 --- a/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -179,8 +179,6 @@ static DecodeStatus DecodeAddrMode7Operand(llvm::MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeBranchImmInstruction(llvm::MCInst &Inst,unsigned Insn, uint64_t Address, const void *Decoder); -static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder); static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder); static DecodeStatus DecodeVLDInstruction(llvm::MCInst &Inst, unsigned Val, @@ -251,6 +249,11 @@ static DecodeStatus DecodeVMOVRRS(llvm::MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); static DecodeStatus DecodeSwap(llvm::MCInst &Inst, unsigned Insn, uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVCVTD(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); +static DecodeStatus DecodeVCVTQ(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder); + static DecodeStatus DecodeThumbAddSpecialReg(llvm::MCInst &Inst, uint16_t Insn, uint64_t Address, const void *Decoder); @@ -1921,12 +1924,6 @@ DecodeBranchImmInstruction(llvm::MCInst &Inst, unsigned Insn, } -static DecodeStatus DecodeVCVTImmOperand(llvm::MCInst &Inst, unsigned Val, - uint64_t Address, const void *Decoder) { - Inst.addOperand(MCOperand::CreateImm(64 - Val)); - return MCDisassembler::Success; -} - static DecodeStatus DecodeAddrMode6Operand(llvm::MCInst &Inst, unsigned Val, uint64_t Address, const void *Decoder) { DecodeStatus S = MCDisassembler::Success; @@ -4085,3 +4082,60 @@ static DecodeStatus DecodeSwap(llvm::MCInst &Inst, unsigned Insn, return S; } + +static DecodeStatus DecodeVCVTD(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned Vd = (fieldFromInstruction32(Insn, 12, 4) << 0); + Vd |= (fieldFromInstruction32(Insn, 22, 1) << 4); + unsigned Vm = (fieldFromInstruction32(Insn, 0, 4) << 0); + Vm |= (fieldFromInstruction32(Insn, 5, 1) << 4); + unsigned imm = fieldFromInstruction32(Insn, 16, 6); + unsigned cmode = fieldFromInstruction32(Insn, 8, 4); + + DecodeStatus S = MCDisassembler::Success; + + // VMOVv2f32 is ambiguous with these decodings. 
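That ambiguity is concrete: in the fixed-point VCVT encodings, an imm6 field with its top three bits clear (imm & 0x38 == 0) together with cmode == 0xF is actually the VMOV (immediate, fp) bit pattern, so the check below rewrites the opcode to VMOVv2f32 and re-dispatches to DecodeNEONModImmInstruction before any operands are added.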
+ if (!(imm & 0x38) && cmode == 0xF) { + Inst.setOpcode(ARM::VMOVv2f32); + return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + } + + if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail); + + if (!Check(S, DecodeDPRRegisterClass(Inst, Vd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeDPRRegisterClass(Inst, Vm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(64 - imm)); + + return S; +} + +static DecodeStatus DecodeVCVTQ(llvm::MCInst &Inst, unsigned Insn, + uint64_t Address, const void *Decoder) { + unsigned Vd = (fieldFromInstruction32(Insn, 12, 4) << 0); + Vd |= (fieldFromInstruction32(Insn, 22, 1) << 4); + unsigned Vm = (fieldFromInstruction32(Insn, 0, 4) << 0); + Vm |= (fieldFromInstruction32(Insn, 5, 1) << 4); + unsigned imm = fieldFromInstruction32(Insn, 16, 6); + unsigned cmode = fieldFromInstruction32(Insn, 8, 4); + + DecodeStatus S = MCDisassembler::Success; + + // VMOVv4f32 is ambiguous with these decodings. + if (!(imm & 0x38) && cmode == 0xF) { + Inst.setOpcode(ARM::VMOVv4f32); + return DecodeNEONModImmInstruction(Inst, Insn, Address, Decoder); + } + + if (!(imm & 0x20)) Check(S, MCDisassembler::SoftFail); + + if (!Check(S, DecodeQPRRegisterClass(Inst, Vd, Address, Decoder))) + return MCDisassembler::Fail; + if (!Check(S, DecodeQPRRegisterClass(Inst, Vm, Address, Decoder))) + return MCDisassembler::Fail; + Inst.addOperand(MCOperand::CreateImm(64 - imm)); + + return S; +} + diff --git a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp index 1bc585b..62d04c4 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMAsmBackend.cpp @@ -60,7 +60,7 @@ public: // ARMFixupKinds.h. // // Name Offset (bits) Size (bits) Flags -{ "fixup_arm_ldst_pcrel_12", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_t2_ldst_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, { "fixup_arm_pcrel_10", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, @@ -68,7 +68,7 @@ public: MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, { "fixup_thumb_adr_pcrel_10",0, 8, MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, -{ "fixup_arm_adr_pcrel_12", 1, 24, MCFixupKindInfo::FKF_IsPCRel }, +{ "fixup_arm_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel }, { "fixup_t2_adr_pcrel_12", 0, 32, MCFixupKindInfo::FKF_IsPCRel | MCFixupKindInfo::FKF_IsAlignedDownTo32Bits}, { "fixup_arm_condbranch", 0, 24, MCFixupKindInfo::FKF_IsPCRel }, @@ -138,7 +138,7 @@ bool ARMAsmBackend::WriteNopData(uint64_t Count, MCObjectWriter *OW) const { const uint16_t Thumb1_16bitNopEncoding = 0x46c0; // using MOV r8,r8 const uint16_t Thumb2_16bitNopEncoding = 0xbf00; // NOP const uint32_t ARMv4_NopEncoding = 0xe1a0000; // using MOV r0,r0 - const uint32_t ARMv6T2_NopEncoding = 0xe3207800; // NOP + const uint32_t ARMv6T2_NopEncoding = 0xe320f000; // NOP if (isThumb()) { const uint16_t nopEncoding = hasNOP() ? 
Thumb2_16bitNopEncoding : Thumb1_16bitNopEncoding; diff --git a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp index 6042b11..e86f48e 100644 --- a/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp +++ b/lib/Target/ARM/MCTargetDesc/ARMMCTargetDesc.cpp @@ -129,14 +129,15 @@ static MCAsmInfo *createARMMCAsmInfo(const Target &T, StringRef TT) { } static MCCodeGenInfo *createARMMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) { Triple TheTriple(TT); // Default relocation model on Darwin is PIC, not DynamicNoPIC. RM = TheTriple.isOSDarwin() ? Reloc::PIC_ : Reloc::DynamicNoPIC; } - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/ARM/Thumb1InstrInfo.cpp b/lib/Target/ARM/Thumb1InstrInfo.cpp index 218311d..de33bd6 100644 --- a/lib/Target/ARM/Thumb1InstrInfo.cpp +++ b/lib/Target/ARM/Thumb1InstrInfo.cpp @@ -18,7 +18,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/ADT/SmallVector.h" #include "Thumb1InstrInfo.h" @@ -60,8 +59,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); @@ -89,8 +87,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); diff --git a/lib/Target/ARM/Thumb2InstrInfo.cpp b/lib/Target/ARM/Thumb2InstrInfo.cpp index cf040c8..7ec3c0e 100644 --- a/lib/Target/ARM/Thumb2InstrInfo.cpp +++ b/lib/Target/ARM/Thumb2InstrInfo.cpp @@ -20,7 +20,6 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/ADT/SmallVector.h" #include "llvm/Support/CommandLine.h" @@ -130,8 +129,7 @@ storeRegToStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOStore, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); @@ -158,8 +156,7 @@ loadRegFromStackSlot(MachineBasicBlock &MBB, MachineBasicBlock::iterator I, MachineFunction &MF = *MBB.getParent(); MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FI)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FI), MachineMemOperand::MOLoad, MFI.getObjectSize(FI), MFI.getObjectAlignment(FI)); diff --git a/lib/Target/CBackend/CBackend.cpp 
b/lib/Target/CBackend/CBackend.cpp index 06e812b..8bce52c 100644 --- a/lib/Target/CBackend/CBackend.cpp +++ b/lib/Target/CBackend/CBackend.cpp @@ -3604,7 +3604,6 @@ void CWriter::visitExtractValueInst(ExtractValueInst &EVI) { bool CTargetMachine::addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &o, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, bool DisableVerify) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; diff --git a/lib/Target/CBackend/CTargetMachine.h b/lib/Target/CBackend/CTargetMachine.h index 4f1ca97..ca346af 100644 --- a/lib/Target/CBackend/CTargetMachine.h +++ b/lib/Target/CBackend/CTargetMachine.h @@ -22,13 +22,13 @@ namespace llvm { struct CTargetMachine : public TargetMachine { CTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : TargetMachine(T, TT, CPU, FS) {} virtual bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, bool DisableVerify); virtual const TargetData *getTargetData() const { return 0; } diff --git a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp index d5af2a8..5ce14c9 100644 --- a/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp +++ b/lib/Target/CellSPU/MCTargetDesc/SPUMCTargetDesc.cpp @@ -62,11 +62,12 @@ static MCAsmInfo *createSPUMCAsmInfo(const Target &T, StringRef TT) { } static MCCodeGenInfo *createSPUMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); // For the time being, use static relocations, since there's really no // support for PIC yet. - X->InitMCCodeGenInfo(Reloc::Static, CM); + X->InitMCCodeGenInfo(Reloc::Static, CM, OL); return X; } diff --git a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp index 99837df..a851be3 100644 --- a/lib/Target/CellSPU/SPUISelDAGToDAG.cpp +++ b/lib/Target/CellSPU/SPUISelDAGToDAG.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Target/TargetOptions.h" #include "llvm/ADT/Statistic.h" #include "llvm/Constants.h" diff --git a/lib/Target/CellSPU/SPUTargetMachine.cpp b/lib/Target/CellSPU/SPUTargetMachine.cpp index 93a7f6e..6940316 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.cpp +++ b/lib/Target/CellSPU/SPUTargetMachine.cpp @@ -34,8 +34,9 @@ SPUFrameLowering::getCalleeSaveSpillSlots(unsigned &NumEntries) const { SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS), DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this), @@ -49,8 +50,7 @@ SPUTargetMachine::SPUTargetMachine(const Target &T, StringRef TT, // Pass Pipeline Configuration //===----------------------------------------------------------------------===// -bool SPUTargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool SPUTargetMachine::addInstSelector(PassManagerBase &PM) { // Install an instruction selector. 
PM.add(createSPUISelDag(*this)); return false; @@ -58,7 +58,7 @@ bool SPUTargetMachine::addInstSelector(PassManagerBase &PM, // passes to run just before printing the assembly bool SPUTargetMachine:: -addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { +addPreEmitPass(PassManagerBase &PM) { // load the TCE instruction scheduler, if available via // loaded plugins typedef llvm::FunctionPass* (*BuilderFunc)(const char*); diff --git a/lib/Target/CellSPU/SPUTargetMachine.h b/lib/Target/CellSPU/SPUTargetMachine.h index fffe77c..909f12e 100644 --- a/lib/Target/CellSPU/SPUTargetMachine.h +++ b/lib/Target/CellSPU/SPUTargetMachine.h @@ -40,7 +40,8 @@ class SPUTargetMachine : public LLVMTargetMachine { public: SPUTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); /// Return the subtarget implementation object virtual const SPUSubtarget *getSubtargetImpl() const { @@ -81,9 +82,8 @@ public: } // Pass Pipeline Configuration - virtual bool addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &, CodeGenOpt::Level); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &); }; } // end namespace llvm diff --git a/lib/Target/CppBackend/CPPBackend.cpp b/lib/Target/CppBackend/CPPBackend.cpp index 394ea2b..efeb989 100644 --- a/lib/Target/CppBackend/CPPBackend.cpp +++ b/lib/Target/CppBackend/CPPBackend.cpp @@ -2065,7 +2065,6 @@ char CppWriter::ID = 0; bool CPPTargetMachine::addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &o, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, bool DisableVerify) { if (FileType != TargetMachine::CGFT_AssemblyFile) return true; PM.add(new CppWriter(o)); diff --git a/lib/Target/CppBackend/CPPTargetMachine.h b/lib/Target/CppBackend/CPPTargetMachine.h index 287e537..a3613b4 100644 --- a/lib/Target/CppBackend/CPPTargetMachine.h +++ b/lib/Target/CppBackend/CPPTargetMachine.h @@ -24,13 +24,13 @@ class formatted_raw_ostream; struct CPPTargetMachine : public TargetMachine { CPPTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : TargetMachine(T, TT, CPU, FS) {} virtual bool addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, bool DisableVerify); virtual const TargetData *getTargetData() const { return 0; } diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.cpp b/lib/Target/MBlaze/MBlazeTargetMachine.cpp index 7bff53e..4ad7bd6 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.cpp +++ b/lib/Target/MBlaze/MBlazeTargetMachine.cpp @@ -34,8 +34,9 @@ extern "C" void LLVMInitializeMBlazeTarget() { MBlazeTargetMachine:: MBlazeTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM): - LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL): + LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS), DataLayout("E-p:32:32:32-i8:8:8-i16:16:16"), InstrInfo(*this), @@ -46,8 +47,7 @@ MBlazeTargetMachine(const Target &T, StringRef TT, // Install an instruction selector pass using // the ISelDag to gen MBlaze code. 
-bool MBlazeTargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool MBlazeTargetMachine::addInstSelector(PassManagerBase &PM) { PM.add(createMBlazeISelDag(*this)); return false; } @@ -55,8 +55,7 @@ bool MBlazeTargetMachine::addInstSelector(PassManagerBase &PM, // Implemented by targets that want to run passes immediately before // machine code is emitted. return true if -print-machineinstrs should // print out the code after the passes. -bool MBlazeTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool MBlazeTargetMachine::addPreEmitPass(PassManagerBase &PM) { PM.add(createMBlazeDelaySlotFillerPass(*this)); return true; } diff --git a/lib/Target/MBlaze/MBlazeTargetMachine.h b/lib/Target/MBlaze/MBlazeTargetMachine.h index c1bc08a..1c1aa53 100644 --- a/lib/Target/MBlaze/MBlazeTargetMachine.h +++ b/lib/Target/MBlaze/MBlazeTargetMachine.h @@ -43,7 +43,8 @@ namespace llvm { public: MBlazeTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const MBlazeInstrInfo *getInstrInfo() const { return &InstrInfo; } @@ -77,8 +78,8 @@ namespace llvm { } // Pass Pipeline Configuration - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level Opt); - virtual bool addPreEmitPass(PassManagerBase &PM,CodeGenOpt::Level Opt); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); }; } // End llvm namespace diff --git a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp index 43ae281..a3a5cf4 100644 --- a/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp +++ b/lib/Target/MBlaze/MCTargetDesc/MBlazeMCTargetDesc.cpp @@ -62,13 +62,14 @@ static MCAsmInfo *createMCAsmInfo(const Target &T, StringRef TT) { } static MCCodeGenInfo *createMBlazeMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) RM = Reloc::Static; if (CM == CodeModel::Default) CM = CodeModel::Small; - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp index fda70b8..0d532e3 100644 --- a/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp +++ b/lib/Target/MSP430/MCTargetDesc/MSP430MCTargetDesc.cpp @@ -51,9 +51,10 @@ static MCSubtargetInfo *createMSP430MCSubtargetInfo(StringRef TT, StringRef CPU, } static MCCodeGenInfo *createMSP430MCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/MSP430/MSP430ISelLowering.cpp b/lib/Target/MSP430/MSP430ISelLowering.cpp index 9daeb2a..5c94137 100644 --- a/lib/Target/MSP430/MSP430ISelLowering.cpp +++ b/lib/Target/MSP430/MSP430ISelLowering.cpp @@ -29,7 +29,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAGISel.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CodeGen/ValueTypes.h" diff --git a/lib/Target/MSP430/MSP430InstrInfo.cpp 
b/lib/Target/MSP430/MSP430InstrInfo.cpp index ffd4318..81f766e 100644 --- a/lib/Target/MSP430/MSP430InstrInfo.cpp +++ b/lib/Target/MSP430/MSP430InstrInfo.cpp @@ -19,7 +19,6 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/Support/TargetRegistry.h" @@ -43,8 +42,7 @@ void MSP430InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FrameIdx)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), MFI.getObjectAlignment(FrameIdx)); @@ -72,8 +70,7 @@ void MSP430InstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FrameIdx)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), MFI.getObjectAlignment(FrameIdx)); diff --git a/lib/Target/MSP430/MSP430TargetMachine.cpp b/lib/Target/MSP430/MSP430TargetMachine.cpp index 4dd8933..fe185fb 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.cpp +++ b/lib/Target/MSP430/MSP430TargetMachine.cpp @@ -28,8 +28,9 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS), // FIXME: Check TargetData string. DataLayout("e-p:16:16:16-i8:8:8-i16:16:16-i32:16:32-n8:16"), @@ -37,15 +38,13 @@ MSP430TargetMachine::MSP430TargetMachine(const Target &T, FrameLowering(Subtarget) { } -bool MSP430TargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool MSP430TargetMachine::addInstSelector(PassManagerBase &PM) { // Install an instruction selector. - PM.add(createMSP430ISelDag(*this, OptLevel)); + PM.add(createMSP430ISelDag(*this, getOptLevel())); return false; } -bool MSP430TargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool MSP430TargetMachine::addPreEmitPass(PassManagerBase &PM) { // Must run branch selection immediately preceding the asm printer. PM.add(createMSP430BranchSelectionPass()); return false; diff --git a/lib/Target/MSP430/MSP430TargetMachine.h b/lib/Target/MSP430/MSP430TargetMachine.h index eb483dc..4fb060f 100644 --- a/lib/Target/MSP430/MSP430TargetMachine.h +++ b/lib/Target/MSP430/MSP430TargetMachine.h @@ -40,7 +40,8 @@ class MSP430TargetMachine : public LLVMTargetMachine { public: MSP430TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const TargetFrameLowering *getFrameLowering() const { return &FrameLowering; @@ -61,8 +62,8 @@ public: return &TSInfo; } - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); }; // MSP430TargetMachine. 
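// The two slimmed-down overrides above follow this commit's recurring change:
// pass-pipeline hooks no longer receive a CodeGenOpt::Level argument. A hook
// that still needs the level asks the machine for it, as in this sketch
// (SomeTargetMachine and createSomeISelDag are illustrative names, assuming
// the pattern the MSP430 and PTX hunks in this patch use):
//
//   bool SomeTargetMachine::addInstSelector(PassManagerBase &PM) {
//     PM.add(createSomeISelDag(*this, getOptLevel())); // level via the TM
//     return false;
//   }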
} // end namespace llvm diff --git a/lib/Target/Mips/CMakeLists.txt b/lib/Target/Mips/CMakeLists.txt index 53656d4d..ac9cfc0 100644 --- a/lib/Target/Mips/CMakeLists.txt +++ b/lib/Target/Mips/CMakeLists.txt @@ -22,7 +22,6 @@ add_llvm_target(MipsCodeGen MipsISelLowering.cpp MipsFrameLowering.cpp MipsMCInstLower.cpp - MipsMCSymbolRefExpr.cpp MipsRegisterInfo.cpp MipsSubtarget.cpp MipsTargetMachine.cpp diff --git a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp index 4f017d0..7bc5fe4 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsAsmBackend.cpp @@ -58,6 +58,7 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { switch (Kind) { default: break; + case FK_GPRel_4: case FK_Data_4: Value &= 0xffffffff; break; @@ -68,6 +69,9 @@ static unsigned adjustFixupValue(unsigned Kind, uint64_t Value) { case Mips::fixup_Mips_PC16: Value &= 0x0000ffff; break; + case Mips::fixup_Mips_HI16: + Value >>= 16; + break; } return Value; @@ -104,15 +108,17 @@ public: llvm_unreachable("Unknown fixup kind!"); case Mips::fixup_Mips_GOT16: // This will be fixed up at link time break; + case FK_GPRel_4: case FK_Data_4: case Mips::fixup_Mips_26: case Mips::fixup_Mips_LO16: case Mips::fixup_Mips_PC16: + case Mips::fixup_Mips_HI16: // For each byte of the fragment that the fixup touches, mask in // the fixup value. The Value has been "split up" into the appropriate // bitfields above. for (unsigned i = 0; i != 4; ++i) // FIXME - Need to support 2 and 8 bytes - Data[Offset + i] |= uint8_t((Value >> (i * 8)) & 0xff); + Data[Offset + i] += uint8_t((Value >> (i * 8)) & 0xff); break; } } diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp index 1115fec..0c3cbb3 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCCodeEmitter.cpp @@ -173,11 +173,21 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, } else if (MO.isExpr()) { const MCExpr *Expr = MO.getExpr(); MCExpr::ExprKind Kind = Expr->getKind(); + unsigned Ret = 0; + + if (Kind == MCExpr::Binary) { + const MCBinaryExpr *BE = static_cast<const MCBinaryExpr*>(Expr); + Expr = BE->getLHS(); + Kind = Expr->getKind(); + const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS()); + assert((Kind == MCExpr::SymbolRef) && CE && + "Binary expression must be sym+const."); + Ret = CE->getValue(); + } + if (Kind == MCExpr::SymbolRef) { - Mips::Fixups FixupKind = Mips::fixup_Mips_NONE; - MCSymbolRefExpr::VariantKind SymRefKind = - cast<MCSymbolRefExpr>(Expr)->getKind(); - switch(SymRefKind) { + Mips::Fixups FixupKind; + switch(cast<MCSymbolRefExpr>(Expr)->getKind()) { case MCSymbolRefExpr::VK_Mips_GPREL: FixupKind = Mips::fixup_Mips_GPREL16; break; @@ -206,12 +216,12 @@ getMachineOpValue(const MCInst &MI, const MCOperand &MO, FixupKind = Mips::fixup_Mips_TPREL_LO; break; default: - return 0; + return Ret; } // switch Fixups.push_back(MCFixup::Create(0, Expr, MCFixupKind(FixupKind))); } // if SymbolRef // All of the information is in the fixup.
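// Sketch of the sym+const handling added above, assuming (as the assert does)
// that a binary operand always has the shape SymbolRef + Constant: the fixup
// is recorded against the symbol, and the constant addend is returned so it
// is folded into the encoded field. A hypothetical standalone helper:
static int64_t splitSymPlusConst(const MCExpr *&Expr) {
  // Peel a trailing constant off "sym + const", leaving Expr at the symbol.
  if (const MCBinaryExpr *BE = dyn_cast<MCBinaryExpr>(Expr)) {
    const MCConstantExpr *CE = dyn_cast<MCConstantExpr>(BE->getRHS());
    assert(CE && "Binary expression must be sym+const.");
    Expr = BE->getLHS();
    return CE->getValue();
  }
  return 0;
}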
- return 0; + return Ret; } llvm_unreachable("Unable to encode MCOperand!"); // Not reached @@ -234,15 +244,22 @@ MipsMCCodeEmitter::getMemEncoding(const MCInst &MI, unsigned OpNo, unsigned MipsMCCodeEmitter::getSizeExtEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { - // FIXME: implement - return 0; + assert(MI.getOperand(OpNo).isImm()); + unsigned szEncoding = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups); + return szEncoding - 1; } +// FIXME: should be called getMSBEncoding +// unsigned MipsMCCodeEmitter::getSizeInsEncoding(const MCInst &MI, unsigned OpNo, SmallVectorImpl<MCFixup> &Fixups) const { - // FIXME: implement - return 0; + assert(MI.getOperand(OpNo-1).isImm()); + assert(MI.getOperand(OpNo).isImm()); + unsigned pos = getMachineOpValue(MI, MI.getOperand(OpNo-1), Fixups); + unsigned sz = getMachineOpValue(MI, MI.getOperand(OpNo), Fixups); + + return pos + sz - 1; } #include "MipsGenMCCodeEmitter.inc" diff --git a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp index e6040e4..1fec88a 100644 --- a/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp +++ b/lib/Target/Mips/MCTargetDesc/MipsMCTargetDesc.cpp @@ -63,11 +63,12 @@ static MCAsmInfo *createMipsMCAsmInfo(const Target &T, StringRef TT) { } static MCCodeGenInfo *createMipsMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) RM = Reloc::PIC_; - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/Mips/Mips64InstrInfo.td b/lib/Target/Mips/Mips64InstrInfo.td index 3c97241..b0fb4fa 100644 --- a/lib/Target/Mips/Mips64InstrInfo.td +++ b/lib/Target/Mips/Mips64InstrInfo.td @@ -175,6 +175,7 @@ def SCD : SCBase<0x3c, "scd", CPU64Regs, mem>, Requires<[NotN64]>; def SCD_P8 : SCBase<0x3c, "scd", CPU64Regs, mem64>, Requires<[IsN64]>; /// Jump and Branch Instructions +def JR64 : JumpFR<0x00, 0x08, "jr", CPU64Regs>; def JAL64 : JumpLink64<0x03, "jal">; def JALR64 : JumpLinkReg64<0x00, 0x09, "jalr">; def BEQ64 : CBranch<0x04, "beq", seteq, CPU64Regs>; @@ -231,7 +232,24 @@ let Predicates = [IsN64] in { } // hi/lo relocs -def : Pat<(i64 (MipsLo tglobaladdr:$in)), (DADDiu ZERO_64, tglobaladdr:$in)>; +def : Pat<(MipsHi tglobaladdr:$in), (LUi64 tglobaladdr:$in)>; +def : Pat<(MipsHi tblockaddress:$in), (LUi64 tblockaddress:$in)>; +def : Pat<(MipsHi tjumptable:$in), (LUi64 tjumptable:$in)>; +def : Pat<(MipsHi tconstpool:$in), (LUi64 tconstpool:$in)>; + +def : Pat<(MipsLo tglobaladdr:$in), (DADDiu ZERO_64, tglobaladdr:$in)>; +def : Pat<(MipsLo tblockaddress:$in), (DADDiu ZERO_64, tblockaddress:$in)>; +def : Pat<(MipsLo tjumptable:$in), (DADDiu ZERO_64, tjumptable:$in)>; +def : Pat<(MipsLo tconstpool:$in), (DADDiu ZERO_64, tconstpool:$in)>; + +def : Pat<(add CPU64Regs:$hi, (MipsLo tglobaladdr:$lo)), + (DADDiu CPU64Regs:$hi, tglobaladdr:$lo)>; +def : Pat<(add CPU64Regs:$hi, (MipsLo tblockaddress:$lo)), + (DADDiu CPU64Regs:$hi, tblockaddress:$lo)>; +def : Pat<(add CPU64Regs:$hi, (MipsLo tjumptable:$lo)), + (DADDiu CPU64Regs:$hi, tjumptable:$lo)>; +def : Pat<(add CPU64Regs:$hi, (MipsLo tconstpool:$lo)), + (DADDiu CPU64Regs:$hi, tconstpool:$lo)>; defm : BrcondPats<CPU64Regs, BEQ64, BNE64, SLT64, SLTu64, SLTi64, SLTiu64, ZERO_64>; diff --git a/lib/Target/Mips/MipsAsmPrinter.cpp b/lib/Target/Mips/MipsAsmPrinter.cpp index 186a5e3..d27e3ab 100644 --- a/lib/Target/Mips/MipsAsmPrinter.cpp +++ 
b/lib/Target/Mips/MipsAsmPrinter.cpp @@ -18,7 +18,6 @@ #include "MipsInstrInfo.h" #include "MipsMachineFunction.h" #include "MipsMCInstLower.h" -#include "MipsMCSymbolRefExpr.h" #include "InstPrinter/MipsInstPrinter.h" #include "MCTargetDesc/MipsBaseInfo.h" #include "llvm/ADT/SmallString.h" @@ -79,12 +78,19 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Enclose unaligned load or store with .macro & .nomacro directives. if (isUnalignedLoadStore(Opc)) { - MCInst Directive; - Directive.setOpcode(Mips::MACRO); - OutStreamer.EmitInstruction(Directive); - OutStreamer.EmitInstruction(TmpInst0); - Directive.setOpcode(Mips::NOMACRO); - OutStreamer.EmitInstruction(Directive); + if (OutStreamer.hasRawTextSupport()) { + MCInst Directive; + Directive.setOpcode(Mips::MACRO); + OutStreamer.EmitInstruction(Directive); + OutStreamer.EmitInstruction(TmpInst0); + Directive.setOpcode(Mips::NOMACRO); + OutStreamer.EmitInstruction(Directive); + } else { + MCInstLowering.LowerUnalignedLoadStore(MI, MCInsts); + for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I + != MCInsts.end(); ++I) + OutStreamer.EmitInstruction(*I); + } return; } @@ -92,8 +98,8 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { // Lower CPLOAD and CPRESTORE if (Opc == Mips::CPLOAD) { MCInstLowering.LowerCPLOAD(MI, MCInsts); - for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); - I != MCInsts.end(); ++I) + for (SmallVector<MCInst, 4>::iterator I = MCInsts.begin(); I + != MCInsts.end(); ++I) OutStreamer.EmitInstruction(*I); return; } @@ -102,7 +108,7 @@ void MipsAsmPrinter::EmitInstruction(const MachineInstr *MI) { MCInstLowering.LowerCPRESTORE(MI, TmpInst0); OutStreamer.EmitInstruction(TmpInst0); return; - } + } } OutStreamer.EmitInstruction(TmpInst0); diff --git a/lib/Target/Mips/MipsAsmPrinter.h b/lib/Target/Mips/MipsAsmPrinter.h index 16461ff..f0c6626 100644 --- a/lib/Target/Mips/MipsAsmPrinter.h +++ b/lib/Target/Mips/MipsAsmPrinter.h @@ -27,9 +27,11 @@ class MachineBasicBlock; class Module; class LLVM_LIBRARY_VISIBILITY MipsAsmPrinter : public AsmPrinter { - const MipsSubtarget *Subtarget; - + public: + + const MipsSubtarget *Subtarget; + explicit MipsAsmPrinter(TargetMachine &TM, MCStreamer &Streamer) : AsmPrinter(TM, Streamer) { Subtarget = &TM.getSubtarget<MipsSubtarget>(); diff --git a/lib/Target/Mips/MipsFrameLowering.cpp b/lib/Target/Mips/MipsFrameLowering.cpp index 19bb1a5..36aef99 100644 --- a/lib/Target/Mips/MipsFrameLowering.cpp +++ b/lib/Target/Mips/MipsFrameLowering.cpp @@ -152,6 +152,9 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { bool ATUsed; unsigned GP = STI.isABI_N64() ? Mips::GP_64 : Mips::GP; unsigned T9 = STI.isABI_N64() ? Mips::T9_64 : Mips::T9; + unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; + unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP; + unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu; unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu; unsigned LUi = STI.isABI_N64() ? Mips::LUi64 : Mips::LUi; @@ -169,13 +172,14 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { MFI->setStackSize(StackSize); BuildMI(MBB, MBBI, dl, TII.get(Mips::NOREORDER)); + BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO)); // Emit instructions that set $gp using the the value of $t9. // O32 uses the directive .cpload while N32/64 requires three instructions to // do this. // TODO: Do not emit these instructions if no instructions use $gp. 
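// The prologue and epilogue hunks here share one idiom: under the N64 ABI,
// select the 64-bit register file and doubleword ALU opcodes, otherwise the
// 32-bit ones, so a single BuildMI call serves both ABIs. In sketch form
// (simplified from the surrounding code; the real stack adjustment goes
// through expandRegLargeImmPair first):
//
//   unsigned SP    = STI.isABI_N64() ? Mips::SP_64  : Mips::SP;
//   unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu;
//   // "addiu $sp, $sp, -imm" becomes daddiu automatically on N64:
//   BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(SP).addImm(-Imm);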
if (isPIC && STI.isABI_O32()) - BuildMI(MBB, MBBI, dl, TII.get(Mips::CPLOAD)) + BuildMI(MBB, llvm::prior(MBBI), dl, TII.get(Mips::CPLOAD)) .addReg(RegInfo->getPICCallReg()); else if (STI.isABI_N64() || (isPIC && STI.isABI_N32())) { // lui $28,%hi(%neg(%gp_rel(fname))) @@ -189,8 +193,6 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { .addGlobalAddress(FName, 0, MipsII::MO_GPOFF_LO); } - BuildMI(MBB, MBBI, dl, TII.get(Mips::NOMACRO)); - // No need to allocate space on the stack. if (StackSize == 0 && !MFI->adjustsStack()) return; @@ -199,10 +201,8 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { MachineLocation DstML, SrcML; // Adjust stack : addi sp, sp, (-imm) - ATUsed = expandRegLargeImmPair(Mips::SP, -StackSize, NewReg, NewImm, MBB, - MBBI); - BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP) - .addReg(NewReg).addImm(NewImm); + ATUsed = expandRegLargeImmPair(SP, -StackSize, NewReg, NewImm, MBB, MBBI); + BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(NewReg).addImm(NewImm); // FIXME: change this when mips goes MC". if (ATUsed) @@ -262,14 +262,13 @@ void MipsFrameLowering::emitPrologue(MachineFunction &MF) const { // if framepointer enabled, set it to point to the stack pointer. if (hasFP(MF)) { // Insert instruction "move $fp, $sp" at this location. - BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDu), Mips::FP) - .addReg(Mips::SP).addReg(Mips::ZERO); + BuildMI(MBB, MBBI, dl, TII.get(ADDu), FP).addReg(SP).addReg(ZERO); // emit ".cfi_def_cfa_register $fp" MCSymbol *SetFPLabel = MMI.getContext().CreateTempSymbol(); BuildMI(MBB, MBBI, dl, TII.get(TargetOpcode::PROLOG_LABEL)).addSym(SetFPLabel); - DstML = MachineLocation(Mips::FP); + DstML = MachineLocation(FP); SrcML = MachineLocation(MachineLocation::VirtualFP); Moves.push_back(MachineMove(SetFPLabel, DstML, SrcML)); } @@ -293,6 +292,11 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF, const MipsInstrInfo &TII = *static_cast<const MipsInstrInfo*>(MF.getTarget().getInstrInfo()); DebugLoc dl = MBBI->getDebugLoc(); + unsigned SP = STI.isABI_N64() ? Mips::SP_64 : Mips::SP; + unsigned FP = STI.isABI_N64() ? Mips::FP_64 : Mips::FP; + unsigned ZERO = STI.isABI_N64() ? Mips::ZERO_64 : Mips::ZERO; + unsigned ADDu = STI.isABI_N64() ? Mips::DADDu : Mips::ADDu; + unsigned ADDiu = STI.isABI_N64() ? Mips::DADDiu : Mips::ADDiu; // Get the number of bytes from FrameInfo unsigned StackSize = MFI->getStackSize(); @@ -310,16 +314,13 @@ void MipsFrameLowering::emitEpilogue(MachineFunction &MF, --I; // Insert instruction "move $sp, $fp" at this location. - BuildMI(MBB, I, dl, TII.get(Mips::ADDu), Mips::SP) - .addReg(Mips::FP).addReg(Mips::ZERO); + BuildMI(MBB, I, dl, TII.get(ADDu), SP).addReg(FP).addReg(ZERO); } // adjust stack : insert addi sp, sp, (imm) if (StackSize) { - ATUsed = expandRegLargeImmPair(Mips::SP, StackSize, NewReg, NewImm, MBB, - MBBI); - BuildMI(MBB, MBBI, dl, TII.get(Mips::ADDiu), Mips::SP) - .addReg(NewReg).addImm(NewImm); + ATUsed = expandRegLargeImmPair(SP, StackSize, NewReg, NewImm, MBB, MBBI); + BuildMI(MBB, MBBI, dl, TII.get(ADDiu), SP).addReg(NewReg).addImm(NewImm); // FIXME: change this when mips goes MC". if (ATUsed) @@ -331,13 +332,15 @@ void MipsFrameLowering:: processFunctionBeforeCalleeSavedScan(MachineFunction &MF, RegScavenger *RS) const { MachineRegisterInfo& MRI = MF.getRegInfo(); + unsigned RA = STI.isABI_N64() ? Mips::RA_64 : Mips::RA; + unsigned FP = STI.isABI_N64() ? 
Mips::FP_64 : Mips::FP; // FIXME: remove this code if register allocator can correctly mark // $fp and $ra used or unused. // Mark $fp and $ra as used or unused. if (hasFP(MF)) - MRI.setPhysRegUsed(Mips::FP); + MRI.setPhysRegUsed(FP); // The register allocator might determine $ra is used after seeing // instruction "jr $ra", but we do not want PrologEpilogInserter to insert @@ -345,7 +348,7 @@ processFunctionBeforeCalleeSavedScan(MachineFunction &MF, // To correct this, $ra is explicitly marked unused if there is no // function call. if (MF.getFrameInfo()->hasCalls()) - MRI.setPhysRegUsed(Mips::RA); + MRI.setPhysRegUsed(RA); else - MRI.setPhysRegUnused(Mips::RA); + MRI.setPhysRegUnused(RA); } diff --git a/lib/Target/Mips/MipsISelLowering.cpp b/lib/Target/Mips/MipsISelLowering.cpp index b595f03..b5a15cf 100644 --- a/lib/Target/Mips/MipsISelLowering.cpp +++ b/lib/Target/Mips/MipsISelLowering.cpp @@ -127,9 +127,11 @@ MipsTargetLowering(MipsTargetMachine &TM) setOperationAction(ISD::GlobalAddress, MVT::i32, Custom); setOperationAction(ISD::GlobalAddress, MVT::i64, Custom); setOperationAction(ISD::BlockAddress, MVT::i32, Custom); + setOperationAction(ISD::BlockAddress, MVT::i64, Custom); setOperationAction(ISD::GlobalTLSAddress, MVT::i32, Custom); setOperationAction(ISD::JumpTable, MVT::i32, Custom); setOperationAction(ISD::ConstantPool, MVT::i32, Custom); + setOperationAction(ISD::ConstantPool, MVT::i64, Custom); setOperationAction(ISD::SELECT, MVT::f32, Custom); setOperationAction(ISD::SELECT, MVT::f64, Custom); setOperationAction(ISD::SELECT, MVT::i32, Custom); @@ -1506,7 +1508,7 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, // FIXME there isn't actually debug info here DebugLoc dl = Op.getDebugLoc(); - if (getTargetMachine().getRelocationModel() != Reloc::PIC_) { + if (getTargetMachine().getRelocationModel() != Reloc::PIC_ && !IsN64) { // %hi/%lo relocation SDValue BAHi = DAG.getBlockAddress(BA, MVT::i32, true, MipsII::MO_ABS_HI); @@ -1517,16 +1519,17 @@ SDValue MipsTargetLowering::LowerBlockAddress(SDValue Op, return DAG.getNode(ISD::ADD, dl, MVT::i32, Hi, Lo); } - SDValue BAGOTOffset = DAG.getBlockAddress(BA, MVT::i32, true, - MipsII::MO_GOT); - BAGOTOffset = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, BAGOTOffset); - SDValue BALOOffset = DAG.getBlockAddress(BA, MVT::i32, true, - MipsII::MO_ABS_LO); - SDValue Load = DAG.getLoad(MVT::i32, dl, + EVT ValTy = Op.getValueType(); + unsigned GOTFlag = IsN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; + unsigned OFSTFlag = IsN64 ? 
MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; + SDValue BAGOTOffset = DAG.getBlockAddress(BA, ValTy, true, GOTFlag); + BAGOTOffset = DAG.getNode(MipsISD::WrapperPIC, dl, ValTy, BAGOTOffset); + SDValue BALOOffset = DAG.getBlockAddress(BA, ValTy, true, OFSTFlag); + SDValue Load = DAG.getLoad(ValTy, dl, DAG.getEntryNode(), BAGOTOffset, MachinePointerInfo(), false, false, false, 0); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, BALOOffset); - return DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, ValTy, BALOOffset); + return DAG.getNode(ISD::ADD, dl, ValTy, Load, Lo); } SDValue MipsTargetLowering:: @@ -1649,16 +1652,19 @@ LowerConstantPool(SDValue Op, SelectionDAG &DAG) const SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CPLo); ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, HiPart, Lo); } else { - SDValue CP = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), - N->getOffset(), MipsII::MO_GOT); - CP = DAG.getNode(MipsISD::WrapperPIC, dl, MVT::i32, CP); - SDValue Load = DAG.getLoad(MVT::i32, dl, DAG.getEntryNode(), + EVT ValTy = Op.getValueType(); + unsigned GOTFlag = IsN64 ? MipsII::MO_GOT_PAGE : MipsII::MO_GOT; + unsigned OFSTFlag = IsN64 ? MipsII::MO_GOT_OFST : MipsII::MO_ABS_LO; + SDValue CP = DAG.getTargetConstantPool(C, ValTy, N->getAlignment(), + N->getOffset(), GOTFlag); + CP = DAG.getNode(MipsISD::WrapperPIC, dl, ValTy, CP); + SDValue Load = DAG.getLoad(ValTy, dl, DAG.getEntryNode(), CP, MachinePointerInfo::getConstantPool(), false, false, false, 0); - SDValue CPLo = DAG.getTargetConstantPool(C, MVT::i32, N->getAlignment(), - N->getOffset(), MipsII::MO_ABS_LO); - SDValue Lo = DAG.getNode(MipsISD::Lo, dl, MVT::i32, CPLo); - ResNode = DAG.getNode(ISD::ADD, dl, MVT::i32, Load, Lo); + SDValue CPLo = DAG.getTargetConstantPool(C, ValTy, N->getAlignment(), + N->getOffset(), OFSTFlag); + SDValue Lo = DAG.getNode(MipsISD::Lo, dl, ValTy, CPLo); + ResNode = DAG.getNode(ISD::ADD, dl, ValTy, Load, Lo); } return ResNode; @@ -2063,6 +2069,7 @@ PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, bool IsRegLoc = VA.isRegLoc(); unsigned Offset = 0; // Offset in # of bytes from the beginning of struct. unsigned LocMemOffset = 0; + unsigned MemCpySize = ByValSize; if (!IsRegLoc) LocMemOffset = VA.getLocMemOffset(); @@ -2082,9 +2089,13 @@ PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, RegsToPass.push_back(std::make_pair(*Reg, LoadVal)); } + // Return if the struct has been fully copied. + if (!(MemCpySize = ByValSize - Offset)) + return; + // If there is an argument register available, copy the remainder of the // byval argument with sub-doubleword loads and shifts. - if ((Reg != RegEnd) && (ByValSize != Offset)) { + if (Reg != RegEnd) { assert((ByValSize < Offset + 8) && "Size of the remainder should be smaller than 8-byte."); SDValue Val; @@ -2119,19 +2130,18 @@ PassByValArg64(SDValue& ByValChain, SDValue Chain, DebugLoc dl, } } - unsigned MemCpySize = ByValSize - Offset; - if (MemCpySize) { - // Create a fixed object on stack at offset LocMemOffset and copy - // remainder of byval arg to it with memcpy. 
- SDValue Src = DAG.getNode(ISD::ADD, dl, PtrTy, Arg, - DAG.getConstant(Offset, PtrTy)); - LastFI = MFI->CreateFixedObject(MemCpySize, LocMemOffset, true); - SDValue Dst = DAG.getFrameIndex(LastFI, PtrTy); - ByValChain = DAG.getMemcpy(ByValChain, dl, Dst, Src, - DAG.getConstant(MemCpySize, PtrTy), Alignment, - /*isVolatile=*/false, /*AlwaysInline=*/false, - MachinePointerInfo(0), MachinePointerInfo(0)); - } + assert(MemCpySize && "MemCpySize must not be zero."); + + // Create a fixed object on stack at offset LocMemOffset and copy + // remainder of byval arg to it with memcpy. + SDValue Src = DAG.getNode(ISD::ADD, dl, PtrTy, Arg, + DAG.getConstant(Offset, PtrTy)); + LastFI = MFI->CreateFixedObject(MemCpySize, LocMemOffset, true); + SDValue Dst = DAG.getFrameIndex(LastFI, PtrTy); + ByValChain = DAG.getMemcpy(ByValChain, dl, Dst, Src, + DAG.getConstant(MemCpySize, PtrTy), Alignment, + /*isVolatile=*/false, /*AlwaysInline=*/false, + MachinePointerInfo(0), MachinePointerInfo(0)); } /// LowerCall - functions arguments are copied from virtual regs to diff --git a/lib/Target/Mips/MipsInstrInfo.td b/lib/Target/Mips/MipsInstrInfo.td index 5dca9b6..0ae94ab 100644 --- a/lib/Target/Mips/MipsInstrInfo.td +++ b/lib/Target/Mips/MipsInstrInfo.td @@ -145,7 +145,9 @@ def brtarget : Operand<OtherVT> { let EncoderMethod = "getBranchTargetOpValue"; let OperandType = "OPERAND_PCREL"; } -def calltarget : Operand<i32>; +def calltarget : Operand<iPTR> { + let EncoderMethod = "getJumpTargetOpValue"; +} def calltarget64: Operand<i64>; def simm16 : Operand<i32>; def simm16_64 : Operand<i64>; @@ -378,6 +380,22 @@ class StoreM<bits<6> op, string instr_asm, PatFrag OpNode, RegisterClass RC, let isPseudo = Pseudo; } +// Memory Load/Store +let canFoldAsLoad = 1 in +class LoadX<bits<6> op, RegisterClass RC, + Operand MemOpnd>: + FMem<op, (outs RC:$rt), (ins MemOpnd:$addr), + "", + [], IILoad> { +} + +class StoreX<bits<6> op, RegisterClass RC, + Operand MemOpnd>: + FMem<op, (outs), (ins RC:$rt, MemOpnd:$addr), + "", + [], IIStore> { +} + // 32-bit load. multiclass LoadM32<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { @@ -396,6 +414,13 @@ multiclass LoadM64<bits<6> op, string instr_asm, PatFrag OpNode, Requires<[IsN64]>; } +// 32-bit load. +multiclass LoadX32<bits<6> op> { + def #NAME# : LoadX<op, CPURegs, mem>, + Requires<[NotN64]>; + def _P8 : LoadX<op, CPURegs, mem64>, + Requires<[IsN64]>; +} // 32-bit store. multiclass StoreM32<bits<6> op, string instr_asm, PatFrag OpNode, bit Pseudo = 0> { @@ -414,6 +439,14 @@ multiclass StoreM64<bits<6> op, string instr_asm, PatFrag OpNode, Requires<[IsN64]>; } +// 32-bit store. 
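// LWL/LWR (and SWL/SWR below) are the hardware primitives that the unaligned
// pseudo-ops ULW/USW expand into. For a little-endian ULW of base+off, the
// MipsMCInstLower expansion later in this patch pairs the two loads roughly
// as follows (the off+3 corresponds to the LowerOperand(unloweredName, 3)
// adjustment there; big-endian support is still a FIXME):
//
//   lwl $rt, off+3($base)  // picks up the most-significant bytes
//   lwr $rt, off+0($base)  // picks up the least-significant bytes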
+multiclass StoreX32<bits<6> op> { + def #NAME# : StoreX<op, CPURegs, mem>, + Requires<[NotN64]>; + def _P8 : StoreX<op, CPURegs, mem64>, + Requires<[IsN64]>; +} + // Conditional Branch class CBranch<bits<6> op, string instr_asm, PatFrag cond_op, RegisterClass RC>: CBranchBase<op, (outs), (ins RC:$rs, RC:$rt, brtarget:$imm16), @@ -458,10 +491,11 @@ class JumpFJ<bits<6> op, string instr_asm>: FJ<op, (outs), (ins jmptarget:$target), !strconcat(instr_asm, "\t$target"), [(br bb:$target)], IIBranch>; -let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1 in -class JumpFR<bits<6> op, bits<6> func, string instr_asm>: - FR<op, func, (outs), (ins CPURegs:$rs), - !strconcat(instr_asm, "\t$rs"), [(brind CPURegs:$rs)], IIBranch> { +let isBranch=1, isTerminator=1, isBarrier=1, rd=0, hasDelaySlot = 1, + isIndirectBranch = 1 in +class JumpFR<bits<6> op, bits<6> func, string instr_asm, RegisterClass RC>: + FR<op, func, (outs), (ins RC:$rs), + !strconcat(instr_asm, "\t$rs"), [(brind RC:$rs)], IIBranch> { let rt = 0; let rd = 0; let shamt = 0; @@ -760,6 +794,12 @@ defm ULW : LoadM32<0x23, "ulw", load_u, 1>; defm USH : StoreM32<0x29, "ush", truncstorei16_u, 1>; defm USW : StoreM32<0x2b, "usw", store_u, 1>; +/// Primitives for unaligned +defm LWL : LoadX32<0x22>; +defm LWR : LoadX32<0x26>; +defm SWL : StoreX32<0x2A>; +defm SWR : StoreX32<0x2E>; + let hasSideEffects = 1 in def SYNC : MipsInst<(outs), (ins i32imm:$stype), "sync $stype", [(MipsSync imm:$stype)], NoItinerary, FrmOther> @@ -779,8 +819,7 @@ def SC_P8 : SCBase<0x38, "sc", CPURegs, mem64>, Requires<[IsN64]>; /// Jump and Branch Instructions def J : JumpFJ<0x02, "j">; -let isIndirectBranch = 1 in - def JR : JumpFR<0x00, 0x08, "jr">; +def JR : JumpFR<0x00, 0x08, "jr", CPURegs>; def JAL : JumpLink<0x03, "jal">; def JALR : JumpLinkReg<0x00, 0x09, "jalr">; def BEQ : CBranch<0x04, "beq", seteq, CPURegs>; @@ -898,20 +937,20 @@ def : Pat<(MipsJmpLink (i32 texternalsym:$dst)), // hi/lo relocs def : Pat<(MipsHi tglobaladdr:$in), (LUi tglobaladdr:$in)>; def : Pat<(MipsHi tblockaddress:$in), (LUi tblockaddress:$in)>; +def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; +def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>; + def : Pat<(MipsLo tglobaladdr:$in), (ADDiu ZERO, tglobaladdr:$in)>; def : Pat<(MipsLo tblockaddress:$in), (ADDiu ZERO, tblockaddress:$in)>; +def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>; +def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>; + def : Pat<(add CPURegs:$hi, (MipsLo tglobaladdr:$lo)), (ADDiu CPURegs:$hi, tglobaladdr:$lo)>; def : Pat<(add CPURegs:$hi, (MipsLo tblockaddress:$lo)), (ADDiu CPURegs:$hi, tblockaddress:$lo)>; - -def : Pat<(MipsHi tjumptable:$in), (LUi tjumptable:$in)>; -def : Pat<(MipsLo tjumptable:$in), (ADDiu ZERO, tjumptable:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tjumptable:$lo)), (ADDiu CPURegs:$hi, tjumptable:$lo)>; - -def : Pat<(MipsHi tconstpool:$in), (LUi tconstpool:$in)>; -def : Pat<(MipsLo tconstpool:$in), (ADDiu ZERO, tconstpool:$in)>; def : Pat<(add CPURegs:$hi, (MipsLo tconstpool:$lo)), (ADDiu CPURegs:$hi, tconstpool:$lo)>; diff --git a/lib/Target/Mips/MipsMCInstLower.cpp b/lib/Target/Mips/MipsMCInstLower.cpp index 1fab52c..6fc2af1 100644 --- a/lib/Target/Mips/MipsMCInstLower.cpp +++ b/lib/Target/Mips/MipsMCInstLower.cpp @@ -23,6 +23,7 @@ #include "llvm/MC/MCExpr.h" #include "llvm/MC/MCInst.h" #include "llvm/Target/Mangler.h" + using namespace llvm; MipsMCInstLower::MipsMCInstLower(Mangler *mang, const MachineFunction &mf, @@ -55,34 +56,34 @@ 
MCOperand MipsMCInstLower::LowerSymbolOperand(const MachineOperand &MO, } switch (MOTy) { - case MachineOperand::MO_MachineBasicBlock: - Symbol = MO.getMBB()->getSymbol(); - break; - - case MachineOperand::MO_GlobalAddress: - Symbol = Mang->getSymbol(MO.getGlobal()); - break; - - case MachineOperand::MO_BlockAddress: - Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()); - break; - - case MachineOperand::MO_ExternalSymbol: - Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName()); - break; - - case MachineOperand::MO_JumpTableIndex: - Symbol = AsmPrinter.GetJTISymbol(MO.getIndex()); - break; - - case MachineOperand::MO_ConstantPoolIndex: - Symbol = AsmPrinter.GetCPISymbol(MO.getIndex()); - if (MO.getOffset()) - Offset += MO.getOffset(); - break; - - default: - llvm_unreachable("<unknown operand type>"); + case MachineOperand::MO_MachineBasicBlock: + Symbol = MO.getMBB()->getSymbol(); + break; + + case MachineOperand::MO_GlobalAddress: + Symbol = Mang->getSymbol(MO.getGlobal()); + break; + + case MachineOperand::MO_BlockAddress: + Symbol = AsmPrinter.GetBlockAddressSymbol(MO.getBlockAddress()); + break; + + case MachineOperand::MO_ExternalSymbol: + Symbol = AsmPrinter.GetExternalSymbolSymbol(MO.getSymbolName()); + break; + + case MachineOperand::MO_JumpTableIndex: + Symbol = AsmPrinter.GetJTISymbol(MO.getIndex()); + break; + + case MachineOperand::MO_ConstantPoolIndex: + Symbol = AsmPrinter.GetCPISymbol(MO.getIndex()); + if (MO.getOffset()) + Offset += MO.getOffset(); + break; + + default: + llvm_unreachable("<unknown operand type>"); } const MCSymbolRefExpr *MCSym = MCSymbolRefExpr::Create(Symbol, Kind, Ctx); @@ -145,8 +146,8 @@ void MipsMCInstLower::LowerCPRESTORE(const MachineInstr *MI, MCInst &OutMI) { OutMI.addOperand(MCOperand::CreateImm(MO.getImm())); } - -MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO) const { +MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO, + unsigned offset) const { MachineOperandType MOTy = MO.getType(); switch (MOTy) { @@ -158,14 +159,14 @@ MCOperand MipsMCInstLower::LowerOperand(const MachineOperand& MO) const { if (MO.isImplicit()) break; return MCOperand::CreateReg(MO.getReg()); case MachineOperand::MO_Immediate: - return MCOperand::CreateImm(MO.getImm()); + return MCOperand::CreateImm(MO.getImm() + offset); case MachineOperand::MO_MachineBasicBlock: case MachineOperand::MO_GlobalAddress: case MachineOperand::MO_ExternalSymbol: case MachineOperand::MO_JumpTableIndex: case MachineOperand::MO_ConstantPoolIndex: case MachineOperand::MO_BlockAddress: - return LowerSymbolOperand(MO, MOTy, 0); + return LowerSymbolOperand(MO, MOTy, offset); } return MCOperand(); @@ -182,3 +183,116 @@ void MipsMCInstLower::Lower(const MachineInstr *MI, MCInst &OutMI) const { OutMI.addOperand(MCOp); } } + +void MipsMCInstLower::LowerUnalignedLoadStore(const MachineInstr *MI, + SmallVector<MCInst, + 4>& MCInsts) { + unsigned Opc = MI->getOpcode(); + MCInst instr1, instr2, instr3, move; + + bool two_instructions = false; + + assert(MI->getNumOperands() == 3); + assert(MI->getOperand(0).isReg()); + assert(MI->getOperand(1).isReg()); + + MCOperand target = LowerOperand(MI->getOperand(0)); + MCOperand base = LowerOperand(MI->getOperand(1)); + MCOperand atReg = MCOperand::CreateReg(Mips::AT); + MCOperand zeroReg = MCOperand::CreateReg(Mips::ZERO); + + MachineOperand unloweredName = MI->getOperand(2); + MCOperand name = LowerOperand(unloweredName); + + move.setOpcode(Mips::ADDu); + move.addOperand(target); + move.addOperand(atReg); + 
move.addOperand(zeroReg); + + switch (Opc) { + case Mips::ULW: { + // FIXME: only works for little endian right now + MCOperand adj_name = LowerOperand(unloweredName, 3); + if (base.getReg() == (target.getReg())) { + instr1.setOpcode(Mips::LWL); + instr1.addOperand(atReg); + instr1.addOperand(base); + instr1.addOperand(adj_name); + instr2.setOpcode(Mips::LWR); + instr2.addOperand(atReg); + instr2.addOperand(base); + instr2.addOperand(name); + instr3 = move; + } else { + two_instructions = true; + instr1.setOpcode(Mips::LWL); + instr1.addOperand(target); + instr1.addOperand(base); + instr1.addOperand(adj_name); + instr2.setOpcode(Mips::LWR); + instr2.addOperand(target); + instr2.addOperand(base); + instr2.addOperand(name); + } + break; + } + case Mips::ULHu: { + // FIXME: only works for little endian right now + MCOperand adj_name = LowerOperand(unloweredName, 1); + instr1.setOpcode(Mips::LBu); + instr1.addOperand(atReg); + instr1.addOperand(base); + instr1.addOperand(adj_name); + instr2.setOpcode(Mips::LBu); + instr2.addOperand(target); + instr2.addOperand(base); + instr2.addOperand(name); + instr3.setOpcode(Mips::INS); + instr3.addOperand(target); + instr3.addOperand(atReg); + instr3.addOperand(MCOperand::CreateImm(0x8)); + instr3.addOperand(MCOperand::CreateImm(0x18)); + break; + } + + case Mips::USW: { + // FIXME: only works for little endian right now + assert (base.getReg() != target.getReg()); + two_instructions = true; + MCOperand adj_name = LowerOperand(unloweredName, 3); + instr1.setOpcode(Mips::SWL); + instr1.addOperand(target); + instr1.addOperand(base); + instr1.addOperand(adj_name); + instr2.setOpcode(Mips::SWR); + instr2.addOperand(target); + instr2.addOperand(base); + instr2.addOperand(name); + break; + } + case Mips::USH: { + MCOperand adj_name = LowerOperand(unloweredName, 1); + instr1.setOpcode(Mips::SB); + instr1.addOperand(target); + instr1.addOperand(base); + instr1.addOperand(name); + instr2.setOpcode(Mips::SRL); + instr2.addOperand(atReg); + instr2.addOperand(target); + instr2.addOperand(MCOperand::CreateImm(8)); + instr3.setOpcode(Mips::SB); + instr3.addOperand(atReg); + instr3.addOperand(base); + instr3.addOperand(adj_name); + break; + } + default: + // FIXME: need to add others + assert(0 && "unaligned instruction not processed"); + } + + MCInsts.push_back(instr1); + MCInsts.push_back(instr2); + if (!two_instructions) MCInsts.push_back(instr3); +} + diff --git a/lib/Target/Mips/MipsMCInstLower.h b/lib/Target/Mips/MipsMCInstLower.h index 3a24da2..98e37e4 100644 --- a/lib/Target/Mips/MipsMCInstLower.h +++ b/lib/Target/Mips/MipsMCInstLower.h @@ -37,10 +37,12 @@ public: void Lower(const MachineInstr *MI, MCInst &OutMI) const; void LowerCPLOAD(const MachineInstr *MI, SmallVector<MCInst, 4>& MCInsts); void LowerCPRESTORE(const MachineInstr *MI, MCInst &OutMI); + void LowerUnalignedLoadStore(const MachineInstr *MI, + SmallVector<MCInst, 4>& MCInsts); private: MCOperand LowerSymbolOperand(const MachineOperand &MO, MachineOperandType MOTy, unsigned Offset) const; - MCOperand LowerOperand(const MachineOperand& MO) const; + MCOperand LowerOperand(const MachineOperand& MO, unsigned offset = 0) const; }; } diff --git a/lib/Target/Mips/MipsMCSymbolRefExpr.cpp b/lib/Target/Mips/MipsMCSymbolRefExpr.cpp deleted file mode 100644 index a0a242c..0000000 --- a/lib/Target/Mips/MipsMCSymbolRefExpr.cpp +++ /dev/null @@ -1,70 +0,0 @@ -//===-- MipsMCSymbolRefExpr.cpp - Mips specific MC expression classes -----===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed 
under the University of Illinois Open Source -// License. See LICENSE.TXT for details. -// -//===----------------------------------------------------------------------===// - -#define DEBUG_TYPE "mipsmcsymbolrefexpr" -#include "MipsMCSymbolRefExpr.h" -#include "llvm/MC/MCAssembler.h" -#include "llvm/MC/MCContext.h" -#include "llvm/MC/MCSymbol.h" -using namespace llvm; - -const MipsMCSymbolRefExpr* -MipsMCSymbolRefExpr::Create(VariantKind Kind, const MCSymbol *Symbol, - int Offset, MCContext &Ctx) { - return new (Ctx) MipsMCSymbolRefExpr(Kind, Symbol, Offset); -} - -void MipsMCSymbolRefExpr::PrintImpl(raw_ostream &OS) const { - switch (Kind) { - default: assert(0 && "Invalid kind!"); - case VK_Mips_None: break; - case VK_Mips_GPREL: OS << "%gp_rel("; break; - case VK_Mips_GOT_CALL: OS << "%call16("; break; - case VK_Mips_GOT: OS << "%got("; break; - case VK_Mips_ABS_HI: OS << "%hi("; break; - case VK_Mips_ABS_LO: OS << "%lo("; break; - case VK_Mips_TLSGD: OS << "%tlsgd("; break; - case VK_Mips_GOTTPREL: OS << "%gottprel("; break; - case VK_Mips_TPREL_HI: OS << "%tprel_hi("; break; - case VK_Mips_TPREL_LO: OS << "%tprel_lo("; break; - case VK_Mips_GPOFF_HI: OS << "%hi(%neg(%gp_rel("; break; - case VK_Mips_GPOFF_LO: OS << "%lo(%neg(%gp_rel("; break; - case VK_Mips_GOT_DISP: OS << "%got_disp("; break; - case VK_Mips_GOT_PAGE: OS << "%got_page("; break; - case VK_Mips_GOT_OFST: OS << "%got_ofst("; break; - } - - OS << *Symbol; - - if (Offset) { - if (Offset > 0) - OS << '+'; - OS << Offset; - } - - if (Kind == VK_Mips_GPOFF_HI || Kind == VK_Mips_GPOFF_LO) - OS << ")))"; - else if (Kind != VK_Mips_None) - OS << ')'; -} - -bool -MipsMCSymbolRefExpr::EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const { - return false; -} - -void MipsMCSymbolRefExpr::AddValueSymbols(MCAssembler *Asm) const { - Asm->getOrCreateSymbolData(*Symbol); -} - -const MCSection *MipsMCSymbolRefExpr::FindAssociatedSection() const { - return Symbol->isDefined() ? &Symbol->getSection() : NULL; -} - diff --git a/lib/Target/Mips/MipsMCSymbolRefExpr.h b/lib/Target/Mips/MipsMCSymbolRefExpr.h deleted file mode 100644 index 55e85a7..0000000 --- a/lib/Target/Mips/MipsMCSymbolRefExpr.h +++ /dev/null @@ -1,67 +0,0 @@ -//===-- MipsMCSymbolRefExpr.h - Mips specific MCSymbolRefExpr class -------===// -// -// The LLVM Compiler Infrastructure -// -// This file is distributed under the University of Illinois Open Source -// License. See LICENSE.TXT for details. 
-// -//===----------------------------------------------------------------------===// - -#ifndef MIPSMCSYMBOLREFEXPR_H -#define MIPSMCSYMBOLREFEXPR_H -#include "llvm/MC/MCExpr.h" - -namespace llvm { - -class MipsMCSymbolRefExpr : public MCTargetExpr { -public: - enum VariantKind { - VK_Mips_None, - VK_Mips_GPREL, - VK_Mips_GOT_CALL, - VK_Mips_GOT, - VK_Mips_ABS_HI, - VK_Mips_ABS_LO, - VK_Mips_TLSGD, - VK_Mips_GOTTPREL, - VK_Mips_TPREL_HI, - VK_Mips_TPREL_LO, - VK_Mips_GPOFF_HI, - VK_Mips_GPOFF_LO, - VK_Mips_GOT_DISP, - VK_Mips_GOT_PAGE, - VK_Mips_GOT_OFST - }; - -private: - const VariantKind Kind; - const MCSymbol *Symbol; - int Offset; - - explicit MipsMCSymbolRefExpr(VariantKind _Kind, const MCSymbol *_Symbol, - int _Offset) - : Kind(_Kind), Symbol(_Symbol), Offset(_Offset) {} - -public: - static const MipsMCSymbolRefExpr *Create(VariantKind Kind, - const MCSymbol *Symbol, int Offset, - MCContext &Ctx); - - void PrintImpl(raw_ostream &OS) const; - bool EvaluateAsRelocatableImpl(MCValue &Res, - const MCAsmLayout *Layout) const; - void AddValueSymbols(MCAssembler *) const; - const MCSection *FindAssociatedSection() const; - - static bool classof(const MCExpr *E) { - return E->getKind() == MCExpr::Target; - } - - static bool classof(const MipsMCSymbolRefExpr *) { return true; } - - int getOffset() const { return Offset; } - void setOffset(int O) { Offset = O; } -}; -} // end namespace llvm - -#endif diff --git a/lib/Target/Mips/MipsRegisterInfo.cpp b/lib/Target/Mips/MipsRegisterInfo.cpp index 5331f09..06c4a66 100644 --- a/lib/Target/Mips/MipsRegisterInfo.cpp +++ b/lib/Target/Mips/MipsRegisterInfo.cpp @@ -285,7 +285,7 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, if (MipsFI->isOutArgFI(FrameIndex) || MipsFI->isDynAllocFI(FrameIndex) || (FrameIndex >= MinCSFI && FrameIndex <= MaxCSFI)) - FrameReg = Mips::SP; + FrameReg = Subtarget.isABI_N64() ? Mips::SP_64 : Mips::SP; else FrameReg = getFrameRegister(MF); @@ -334,8 +334,10 @@ eliminateFrameIndex(MachineBasicBlock::iterator II, int SPAdj, unsigned MipsRegisterInfo:: getFrameRegister(const MachineFunction &MF) const { const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + bool IsN64 = Subtarget.isABI_N64(); - return TFI->hasFP(MF) ? Mips::FP : Mips::SP; + return TFI->hasFP(MF) ? (IsN64 ? Mips::FP_64 : Mips::FP) : + (IsN64 ? Mips::SP_64 : Mips::SP); } unsigned MipsRegisterInfo:: diff --git a/lib/Target/Mips/MipsTargetMachine.cpp b/lib/Target/Mips/MipsTargetMachine.cpp index 6480da3..5d6b24f 100644 --- a/lib/Target/Mips/MipsTargetMachine.cpp +++ b/lib/Target/Mips/MipsTargetMachine.cpp @@ -36,8 +36,9 @@ MipsTargetMachine:: MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle): - LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS, isLittle), DataLayout(isLittle ? (Subtarget.isABI_N64() ? 
@@ -54,31 +55,35 @@ MipsTargetMachine(const Target &T, StringRef TT, MipsebTargetMachine:: MipsebTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, false) {} + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : + MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) {} MipselTargetMachine:: MipselTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, true) {} + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : + MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) {} Mips64ebTargetMachine:: Mips64ebTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, false) {} + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : + MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) {} Mips64elTargetMachine:: Mips64elTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) : - MipsTargetMachine(T, TT, CPU, FS, RM, CM, true) {} + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) : + MipsTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) {} // Install an instruction selector pass using // the ISelDag to gen Mips code. bool MipsTargetMachine:: -addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel) +addInstSelector(PassManagerBase &PM) { PM.add(createMipsISelDag(*this)); return false; @@ -88,14 +93,14 @@ addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel) // machine code is emitted. return true if -print-machineinstrs should // print out the code after the passes. bool MipsTargetMachine:: -addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel) +addPreEmitPass(PassManagerBase &PM) { PM.add(createMipsDelaySlotFillerPass(*this)); return true; } bool MipsTargetMachine:: -addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { +addPreRegAlloc(PassManagerBase &PM) { // Do not restore $gp if target is Mips64. // In N32/64, $gp is a callee-saved register. if (!Subtarget.hasMips64()) @@ -104,14 +109,13 @@ addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { } bool MipsTargetMachine:: -addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel) { +addPostRegAlloc(PassManagerBase &PM) { PM.add(createMipsExpandPseudoPass(*this)); return true; } bool MipsTargetMachine::addCodeEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, - JITCodeEmitter &JCE) { + JITCodeEmitter &JCE) { // Machine code emitter pass for Mips. 
PM.add(createMipsJITCodeEmitterPass(*this, JCE)); return false; diff --git a/lib/Target/Mips/MipsTargetMachine.h b/lib/Target/Mips/MipsTargetMachine.h index 118ed10..e40d9e2 100644 --- a/lib/Target/Mips/MipsTargetMachine.h +++ b/lib/Target/Mips/MipsTargetMachine.h @@ -40,6 +40,7 @@ namespace llvm { MipsTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool isLittle); virtual const MipsInstrInfo *getInstrInfo() const @@ -67,15 +68,11 @@ namespace llvm { } // Pass Pipeline Configuration - virtual bool addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel); - virtual bool addPreRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel); - virtual bool addPostRegAlloc(PassManagerBase &, CodeGenOpt::Level); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); + virtual bool addPreRegAlloc(PassManagerBase &PM); + virtual bool addPostRegAlloc(PassManagerBase &); virtual bool addCodeEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE); }; @@ -86,7 +83,8 @@ class MipsebTargetMachine : public MipsTargetMachine { public: MipsebTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// MipselTargetMachine - Mips32 little endian target machine. @@ -95,7 +93,8 @@ class MipselTargetMachine : public MipsTargetMachine { public: MipselTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// Mips64ebTargetMachine - Mips64 big endian target machine. @@ -104,7 +103,8 @@ class Mips64ebTargetMachine : public MipsTargetMachine { public: Mips64ebTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// Mips64elTargetMachine - Mips64 little endian target machine. 
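// The same two-part change repeats for every target in this patch: the
// constructor takes a CodeGenOpt::Level and forwards it to LLVMTargetMachine,
// and the pass-pipeline hooks drop their OptLevel parameter in favor of
// querying the machine. A minimal sketch for a hypothetical out-of-tree
// target (FooTargetMachine and createFooISelDag are illustrative names only,
// not part of this patch):
class FooTargetMachine : public LLVMTargetMachine {
public:
  FooTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS,
                   Reloc::Model RM, CodeModel::Model CM, CodeGenOpt::Level OL)
    : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL) {}

  virtual bool addInstSelector(PassManagerBase &PM) {
    // The level is recovered from the machine rather than passed in.
    if (getOptLevel() != CodeGenOpt::None)
      PM.add(createFooISelDag(*this)); // hypothetical target ISel pass
    return false;
  }
};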
@@ -113,7 +113,8 @@ class Mips64elTargetMachine : public MipsTargetMachine { public: Mips64elTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; } // End llvm namespace diff --git a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp index a5af3b8..09f86b5 100644 --- a/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp +++ b/lib/Target/PTX/MCTargetDesc/PTXMCTargetDesc.cpp @@ -52,9 +52,10 @@ static MCSubtargetInfo *createPTXMCSubtargetInfo(StringRef TT, StringRef CPU, } static MCCodeGenInfo *createPTXMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/PTX/PTXTargetMachine.cpp b/lib/Target/PTX/PTXTargetMachine.cpp index 50dd417..292ea5e 100644 --- a/lib/Target/PTX/PTXTargetMachine.cpp +++ b/lib/Target/PTX/PTXTargetMachine.cpp @@ -88,8 +88,9 @@ namespace { PTXTargetMachine::PTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), DataLayout(is64Bit ? DataLayout64 : DataLayout32), Subtarget(TT, CPU, FS, is64Bit), FrameLowering(Subtarget), @@ -100,39 +101,38 @@ PTXTargetMachine::PTXTargetMachine(const Target &T, PTX32TargetMachine::PTX32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : PTXTargetMachine(T, TT, CPU, FS, RM, CM, false) { + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : PTXTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) { } PTX64TargetMachine::PTX64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : PTXTargetMachine(T, TT, CPU, FS, RM, CM, true) { + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : PTXTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) { } -bool PTXTargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { - PM.add(createPTXISelDag(*this, OptLevel)); +bool PTXTargetMachine::addInstSelector(PassManagerBase &PM) { + PM.add(createPTXISelDag(*this, getOptLevel())); return false; } -bool PTXTargetMachine::addPostRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool PTXTargetMachine::addPostRegAlloc(PassManagerBase &PM) { // PTXMFInfoExtract must after register allocation! - //PM.add(createPTXMFInfoExtract(*this, OptLevel)); + //PM.add(createPTXMFInfoExtract(*this)); return false; } bool PTXTargetMachine::addPassesToEmitFile(PassManagerBase &PM, formatted_raw_ostream &Out, CodeGenFileType FileType, - CodeGenOpt::Level OptLevel, bool DisableVerify) { // This is mostly based on LLVMTargetMachine::addPassesToEmitFile // Add common CodeGen passes. 
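// With OptLevel removed from the hook signatures, each optimization-only
// pass in the pipeline below is gated on the level stored in the
// TargetMachine. The recurring guard, shown with one representative pass
// (an illustrative sketch of the shape, not additional patch code):
//   if (getOptLevel() != CodeGenOpt::None)
//     PM.add(createTailDuplicatePass(true)); // pre-RA tail duplication
// The level fixed at TargetMachine construction is thus the single source
// of truth for the whole pipeline.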
MCContext *Context = 0; - if (addCommonCodeGenPasses(PM, OptLevel, DisableVerify, Context)) + if (addCommonCodeGenPasses(PM, DisableVerify, Context)) return true; assert(Context != 0 && "Failed to get MCContext"); @@ -192,7 +192,6 @@ bool PTXTargetMachine::addPassesToEmitFile(PassManagerBase &PM, } bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, bool DisableVerify, MCContext *&OutContext) { // Add standard LLVM codegen passes. @@ -214,7 +213,7 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, PM.add(createVerifierPass()); // Run loop strength reduction before anything else. - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { PM.add(createLoopStrengthReducePass(getTargetLowering())); //PM.add(createPrintFunctionPass("\n\n*** Code after LSR ***\n", &dbgs())); } @@ -228,12 +227,12 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, // The lower invoke pass may create unreachable code. Remove it. PM.add(createUnreachableBlockEliminationPass()); - if (OptLevel != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) PM.add(createCodeGenPreparePass(getTargetLowering())); PM.add(createStackProtectorPass(getTargetLowering())); - addPreISel(PM, OptLevel); + addPreISel(PM); //PM.add(createPrintFunctionPass("\n\n" // "*** Final LLVM Code input to ISel ***\n", @@ -255,10 +254,10 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, OutContext = &MMI->getContext(); // Return the MCContext specifically by-ref. // Set up a MachineFunction for the rest of CodeGen to work on. - PM.add(new MachineFunctionAnalysis(*this, OptLevel)); + PM.add(new MachineFunctionAnalysis(*this)); // Ask the target for an isel. - if (addInstSelector(PM, OptLevel)) + if (addInstSelector(PM)) return true; // Print the instruction selected machine code... @@ -268,21 +267,21 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, PM.add(createExpandISelPseudosPass()); // Pre-ra tail duplication. - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { PM.add(createTailDuplicatePass(true)); printAndVerify(PM, "After Pre-RegAlloc TailDuplicate"); } // Optimize PHIs before DCE: removing dead PHI cycles may make more // instructions dead. - if (OptLevel != CodeGenOpt::None) + if (getOptLevel() != CodeGenOpt::None) PM.add(createOptimizePHIsPass()); // If the target requests it, assign local variables to stack slots relative // to one another and simplify frame index references where possible. PM.add(createLocalStackSlotAllocationPass()); - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { // With optimization, dead code should already be eliminated. However // there is one known exception: lowered code for arguments that are only // used by tail calls, where the tail calls reuse the incoming stack @@ -300,7 +299,7 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, } // Run pre-ra passes. - if (addPreRegAlloc(PM, OptLevel)) + if (addPreRegAlloc(PM)) printAndVerify(PM, "After PreRegAlloc passes"); // Perform register allocation. @@ -308,7 +307,7 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, printAndVerify(PM, "After Register Allocation"); // Perform stack slot coloring and post-ra machine LICM. - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { // FIXME: Re-enable coloring with register when it's capable of adding // kill markers. 
PM.add(createStackSlotColoringPass(false)); @@ -322,7 +321,7 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, } // Run post-ra passes. - if (addPostRegAlloc(PM, OptLevel)) + if (addPostRegAlloc(PM)) printAndVerify(PM, "After PostRegAlloc passes"); PM.add(createExpandPostRAPseudosPass()); @@ -333,23 +332,23 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, printAndVerify(PM, "After PrologEpilogCodeInserter"); // Run pre-sched2 passes. - if (addPreSched2(PM, OptLevel)) + if (addPreSched2(PM)) printAndVerify(PM, "After PreSched2 passes"); // Second pass scheduler. - if (OptLevel != CodeGenOpt::None) { - PM.add(createPostRAScheduler(OptLevel)); + if (getOptLevel() != CodeGenOpt::None) { + PM.add(createPostRAScheduler(getOptLevel())); printAndVerify(PM, "After PostRAScheduler"); } // Branch folding must be run after regalloc and prolog/epilog insertion. - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { PM.add(createBranchFoldingPass(getEnableTailMergeDefault())); printNoVerify(PM, "After BranchFolding"); } // Tail duplication. - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { PM.add(createTailDuplicatePass(false)); printNoVerify(PM, "After TailDuplicate"); } @@ -359,16 +358,16 @@ bool PTXTargetMachine::addCommonCodeGenPasses(PassManagerBase &PM, //if (PrintGCInfo) // PM.add(createGCInfoPrinter(dbgs())); - if (OptLevel != CodeGenOpt::None) { + if (getOptLevel() != CodeGenOpt::None) { PM.add(createCodePlacementOptPass()); printNoVerify(PM, "After CodePlacementOpt"); } - if (addPreEmitPass(PM, OptLevel)) + if (addPreEmitPass(PM)) printNoVerify(PM, "After PreEmit passes"); - PM.add(createPTXMFInfoExtract(*this, OptLevel)); - PM.add(createPTXFPRoundingModePass(*this, OptLevel)); + PM.add(createPTXMFInfoExtract(*this, getOptLevel())); + PM.add(createPTXFPRoundingModePass(*this, getOptLevel())); return false; } diff --git a/lib/Target/PTX/PTXTargetMachine.h b/lib/Target/PTX/PTXTargetMachine.h index 5b7c82b..19f6c0f 100644 --- a/lib/Target/PTX/PTXTargetMachine.h +++ b/lib/Target/PTX/PTXTargetMachine.h @@ -37,6 +37,7 @@ class PTXTargetMachine : public LLVMTargetMachine { PTXTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit); virtual const TargetData *getTargetData() const { return &DataLayout; } @@ -58,22 +59,18 @@ class PTXTargetMachine : public LLVMTargetMachine { virtual const PTXSubtarget *getSubtargetImpl() const { return &Subtarget; } - virtual bool addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel); - virtual bool addPostRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPostRegAlloc(PassManagerBase &PM); // We override this method to supply our own set of codegen passes. virtual bool addPassesToEmitFile(PassManagerBase &, formatted_raw_ostream &, CodeGenFileType, - CodeGenOpt::Level, bool = true); // Emission of machine code through JITCodeEmitter is not supported. 
virtual bool addPassesToEmitMachineCode(PassManagerBase &, JITCodeEmitter &, - CodeGenOpt::Level, bool = true) { return true; } @@ -82,14 +79,13 @@ class PTXTargetMachine : public LLVMTargetMachine { virtual bool addPassesToEmitMC(PassManagerBase &, MCContext *&, raw_ostream &, - CodeGenOpt::Level, bool = true) { return true; } private: - bool addCommonCodeGenPasses(PassManagerBase &, CodeGenOpt::Level, + bool addCommonCodeGenPasses(PassManagerBase &, bool DisableVerify, MCContext *&OutCtx); }; // class PTXTargetMachine @@ -99,7 +95,8 @@ public: PTX32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; // class PTX32TargetMachine class PTX64TargetMachine : public PTXTargetMachine { @@ -107,7 +104,8 @@ public: PTX64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; // class PTX32TargetMachine } // namespace llvm diff --git a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp index d5c8a9e..7c47051 100644 --- a/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp +++ b/lib/Target/PowerPC/MCTargetDesc/PPCMCTargetDesc.cpp @@ -76,7 +76,8 @@ static MCAsmInfo *createPPCMCAsmInfo(const Target &T, StringRef TT) { } static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); if (RM == Reloc::Default) { @@ -86,7 +87,7 @@ static MCCodeGenInfo *createPPCMCCodeGenInfo(StringRef TT, Reloc::Model RM, else RM = Reloc::Static; } - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp index 6f204cc..3dee406 100644 --- a/lib/Target/PowerPC/PPCISelDAGToDAG.cpp +++ b/lib/Target/PowerPC/PPCISelDAGToDAG.cpp @@ -18,7 +18,6 @@ #include "MCTargetDesc/PPCPredicates.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineFunction.h" -#include "llvm/CodeGen/MachineFunctionAnalysis.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/SelectionDAGISel.h" diff --git a/lib/Target/PowerPC/PPCISelLowering.cpp b/lib/Target/PowerPC/PPCISelLowering.cpp index b188b90..36d5c41 100644 --- a/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/lib/Target/PowerPC/PPCISelLowering.cpp @@ -23,7 +23,6 @@ #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/CodeGen/SelectionDAG.h" #include "llvm/CodeGen/TargetLoweringObjectFileImpl.h" #include "llvm/CallingConv.h" @@ -408,6 +407,8 @@ PPCTargetLowering::PPCTargetLowering(PPCTargetMachine &TM) setInsertFencesForAtomic(true); + setSchedulingPreference(Sched::Hybrid); + computeRegisterProperties(); } diff --git a/lib/Target/PowerPC/PPCInstrInfo.cpp b/lib/Target/PowerPC/PPCInstrInfo.cpp index f148e9d..b9a6297 100644 --- a/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -22,7 +22,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCAsmInfo.h" 
#include "llvm/Support/CommandLine.h" #include "llvm/Support/ErrorHandling.h" @@ -57,11 +56,8 @@ ScheduleHazardRecognizer *PPCInstrInfo::CreateTargetHazardRecognizer( unsigned Directive = TM->getSubtarget<PPCSubtarget>().getDarwinDirective(); if (Directive == PPC::DIR_440) { - // Disable the hazard recognizer for now, as it doesn't support - // bottom-up scheduling. - //const InstrItineraryData *II = TM->getInstrItineraryData(); - //return new PPCHazardRecognizer440(II, DAG); - return new ScheduleHazardRecognizer(); + const InstrItineraryData *II = TM->getInstrItineraryData(); + return new PPCHazardRecognizer440(II, DAG); } else { // Disable the hazard recognizer for now, as it doesn't support @@ -501,8 +497,7 @@ PPCInstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, const MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FrameIdx)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), MachineMemOperand::MOStore, MFI.getObjectSize(FrameIdx), MFI.getObjectAlignment(FrameIdx)); @@ -623,8 +618,7 @@ PPCInstrInfo::loadRegFromStackSlot(MachineBasicBlock &MBB, const MachineFrameInfo &MFI = *MF.getFrameInfo(); MachineMemOperand *MMO = - MF.getMachineMemOperand( - MachinePointerInfo(PseudoSourceValue::getFixedStack(FrameIdx)), + MF.getMachineMemOperand(MachinePointerInfo::getFixedStack(FrameIdx), MachineMemOperand::MOLoad, MFI.getObjectSize(FrameIdx), MFI.getObjectAlignment(FrameIdx)); diff --git a/lib/Target/PowerPC/PPCRegisterInfo.cpp b/lib/Target/PowerPC/PPCRegisterInfo.cpp index 2e90b7a..3ba9260 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.cpp +++ b/lib/Target/PowerPC/PPCRegisterInfo.cpp @@ -273,6 +273,27 @@ BitVector PPCRegisterInfo::getReservedRegs(const MachineFunction &MF) const { return Reserved; } +unsigned +PPCRegisterInfo::getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const { + const TargetFrameLowering *TFI = MF.getTarget().getFrameLowering(); + const unsigned DefaultSafety = 1; + + switch (RC->getID()) { + default: + return 0; + case PPC::G8RCRegClassID: + case PPC::GPRCRegClassID: { + unsigned FP = TFI->hasFP(MF) ? 1 : 0; + return 32 - FP - DefaultSafety; + } + case PPC::F8RCRegClassID: + case PPC::F4RCRegClassID: + case PPC::VRRCRegClassID: + return 32 - DefaultSafety; + } +} + //===----------------------------------------------------------------------===// // Stack Frame Processing methods //===----------------------------------------------------------------------===// diff --git a/lib/Target/PowerPC/PPCRegisterInfo.h b/lib/Target/PowerPC/PPCRegisterInfo.h index 1cc7213..f70a594 100644 --- a/lib/Target/PowerPC/PPCRegisterInfo.h +++ b/lib/Target/PowerPC/PPCRegisterInfo.h @@ -37,6 +37,9 @@ public: /// This is used for addressing modes. virtual const TargetRegisterClass *getPointerRegClass(unsigned Kind=0) const; + unsigned getRegPressureLimit(const TargetRegisterClass *RC, + MachineFunction &MF) const; + /// Code Generation virtual methods... 
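// Worked numbers for the getRegPressureLimit override declared above (an
// illustrative restatement, not part of the patch): PPC has 32 registers in
// each of these classes, one GPR may be claimed as the frame pointer, and
// one register is always held back as a safety margin.
static unsigned ppcGPRPressureLimit(bool HasFP) {
  const unsigned NumRegs = 32, DefaultSafety = 1;
  return NumRegs - (HasFP ? 1u : 0u) - DefaultSafety; // 30 with FP, else 31
}
// The FP and vector classes have no frame-pointer carve-out, so they report
// 32 - 1 = 31 unconditionally.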
const unsigned *getCalleeSavedRegs(const MachineFunction* MF = 0) const; diff --git a/lib/Target/PowerPC/PPCTargetMachine.cpp b/lib/Target/PowerPC/PPCTargetMachine.cpp index f5744b83..de8fca0 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -29,8 +29,9 @@ extern "C" void LLVMInitializePowerPCTarget() { PPCTargetMachine::PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS, is64Bit), DataLayout(Subtarget.getTargetDataString()), InstrInfo(*this), FrameLowering(Subtarget), JITInfo(*this, is64Bit), @@ -44,15 +45,17 @@ bool PPCTargetMachine::getEnableTailMergeDefault() const { return false; } PPC32TargetMachine::PPC32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : PPCTargetMachine(T, TT, CPU, FS, RM, CM, false) { + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : PPCTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) { } PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : PPCTargetMachine(T, TT, CPU, FS, RM, CM, true) { + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : PPCTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) { } @@ -60,22 +63,19 @@ PPC64TargetMachine::PPC64TargetMachine(const Target &T, StringRef TT, // Pass Pipeline Configuration //===----------------------------------------------------------------------===// -bool PPCTargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool PPCTargetMachine::addInstSelector(PassManagerBase &PM) { // Install an instruction selector. PM.add(createPPCISelDag(*this)); return false; } -bool PPCTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool PPCTargetMachine::addPreEmitPass(PassManagerBase &PM) { // Must run branch selection immediately preceding the asm printer. PM.add(createPPCBranchSelectionPass()); return false; } bool PPCTargetMachine::addCodeEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { // FIXME: This should be moved to TargetJITInfo!! 
if (Subtarget.isPPC64()) diff --git a/lib/Target/PowerPC/PPCTargetMachine.h b/lib/Target/PowerPC/PPCTargetMachine.h index d06f084..03b27c6 100644 --- a/lib/Target/PowerPC/PPCTargetMachine.h +++ b/lib/Target/PowerPC/PPCTargetMachine.h @@ -42,7 +42,8 @@ class PPCTargetMachine : public LLVMTargetMachine { public: PPCTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM, bool is64Bit); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit); virtual const PPCInstrInfo *getInstrInfo() const { return &InstrInfo; } virtual const PPCFrameLowering *getFrameLowering() const { @@ -66,9 +67,9 @@ public: } // Pass Pipeline Configuration - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); + virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); virtual bool getEnableTailMergeDefault() const; }; @@ -79,7 +80,8 @@ class PPC32TargetMachine : public PPCTargetMachine { public: PPC32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// PPC64TargetMachine - PowerPC 64-bit target machine. @@ -88,7 +90,8 @@ class PPC64TargetMachine : public PPCTargetMachine { public: PPC64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; } // end namespace llvm diff --git a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp index cb2a7df..eda04c3 100644 --- a/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp +++ b/lib/Target/Sparc/MCTargetDesc/SparcMCTargetDesc.cpp @@ -50,9 +50,10 @@ static MCSubtargetInfo *createSparcMCSubtargetInfo(StringRef TT, StringRef CPU, } static MCCodeGenInfo *createSparcMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/Sparc/SparcTargetMachine.cpp b/lib/Target/Sparc/SparcTargetMachine.cpp index 3d7b4a4..7dff799 100644 --- a/lib/Target/Sparc/SparcTargetMachine.cpp +++ b/lib/Target/Sparc/SparcTargetMachine.cpp @@ -27,16 +27,16 @@ extern "C" void LLVMInitializeSparcTarget() { SparcTargetMachine::SparcTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS, is64bit), DataLayout(Subtarget.getDataLayout()), TLInfo(*this), TSInfo(*this), InstrInfo(Subtarget), FrameLowering(Subtarget) { } -bool SparcTargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool SparcTargetMachine::addInstSelector(PassManagerBase &PM) { PM.add(createSparcISelDag(*this)); return false; } @@ -44,8 +44,7 @@ bool SparcTargetMachine::addInstSelector(PassManagerBase &PM, /// addPreEmitPass - This pass may be implemented by targets that want to run /// passes immediately before machine code 
is emitted. This should return /// true if -print-machineinstrs should print out the code after the passes. -bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel){ +bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM){ PM.add(createSparcFPMoverPass(*this)); PM.add(createSparcDelaySlotFillerPass(*this)); return true; @@ -54,13 +53,15 @@ bool SparcTargetMachine::addPreEmitPass(PassManagerBase &PM, SparcV8TargetMachine::SparcV8TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, - CodeModel::Model CM) - : SparcTargetMachine(T, TT, CPU, FS, RM, CM, false) { + CodeModel::Model CM, + CodeGenOpt::Level OL) + : SparcTargetMachine(T, TT, CPU, FS, RM, CM, OL, false) { } SparcV9TargetMachine::SparcV9TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, - CodeModel::Model CM) - : SparcTargetMachine(T, TT, CPU, FS, RM, CM, true) { + CodeModel::Model CM, + CodeGenOpt::Level OL) + : SparcTargetMachine(T, TT, CPU, FS, RM, CM, OL, true) { } diff --git a/lib/Target/Sparc/SparcTargetMachine.h b/lib/Target/Sparc/SparcTargetMachine.h index 3c907dd..63bfa5d 100644 --- a/lib/Target/Sparc/SparcTargetMachine.h +++ b/lib/Target/Sparc/SparcTargetMachine.h @@ -35,7 +35,8 @@ class SparcTargetMachine : public LLVMTargetMachine { public: SparcTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM, bool is64bit); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64bit); virtual const SparcInstrInfo *getInstrInfo() const { return &InstrInfo; } virtual const TargetFrameLowering *getFrameLowering() const { @@ -54,8 +55,8 @@ public: virtual const TargetData *getTargetData() const { return &DataLayout; } // Pass Pipeline Configuration - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); }; /// SparcV8TargetMachine - Sparc 32-bit target machine @@ -64,7 +65,8 @@ class SparcV8TargetMachine : public SparcTargetMachine { public: SparcV8TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; /// SparcV9TargetMachine - Sparc 64-bit target machine @@ -73,7 +75,8 @@ class SparcV9TargetMachine : public SparcTargetMachine { public: SparcV9TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); }; } // end namespace llvm diff --git a/lib/Target/TargetLibraryInfo.cpp b/lib/Target/TargetLibraryInfo.cpp index 709dfd2..aa2e014 100644 --- a/lib/Target/TargetLibraryInfo.cpp +++ b/lib/Target/TargetLibraryInfo.cpp @@ -20,6 +20,19 @@ INITIALIZE_PASS(TargetLibraryInfo, "targetlibinfo", "Target Library Information", false, true) char TargetLibraryInfo::ID = 0; +const char* TargetLibraryInfo::StandardNames[LibFunc::NumLibFuncs] = + { + "memset", + "memcpy", + "memmove", + "memset_pattern16", + "iprintf", + "siprintf", + "fiprintf", + "fwrite", + "fputs" + }; + /// initialize - Initialize the set of available library functions based on the /// specified target triple. This should be carefully written so that a missing /// target triple gets a sane set of defaults. 
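// The StandardNames table added above, together with the custom-name
// machinery used below, lets a function be reported as available under a
// remapped symbol; the x86-32 OS X hunk below uses this for the $UNIX2003
// variants. A sketch of how a transform would consult it, assuming the
// has()/getName() accessors that pair with setAvailableWithName:
static bool getFWriteName(const TargetLibraryInfo &TLI, StringRef &Name) {
  if (!TLI.has(LibFunc::fwrite))
    return false;                      // target has no usable fwrite
  Name = TLI.getName(LibFunc::fwrite); // "fwrite" or "fwrite$UNIX2003"
  return true;
}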
@@ -38,6 +51,17 @@ static void initialize(TargetLibraryInfo &TLI, const Triple &T) { TLI.setUnavailable(LibFunc::memset_pattern16); } + if (T.isMacOSX() && T.getArch() == Triple::x86 && + !T.isMacOSXVersionLT(10, 7)) { + // x86-32 OSX has a scheme where fwrite and fputs (and some other functions + // we don't care about) have two versions; on recent OSX, the one we want + // has a $UNIX2003 suffix. The two implementations are identical except + // for the return value in some edge cases. However, we don't want to + // generate code that depends on the old symbols. + TLI.setAvailableWithName(LibFunc::fwrite, "fwrite$UNIX2003"); + TLI.setAvailableWithName(LibFunc::fputs, "fputs$UNIX2003"); + } + // iprintf and friends are only available on XCore and TCE. if (T.getArch() != Triple::xcore && T.getArch() != Triple::tce) { TLI.setUnavailable(LibFunc::iprintf); @@ -64,6 +88,7 @@ TargetLibraryInfo::TargetLibraryInfo(const Triple &T) : ImmutablePass(ID) { TargetLibraryInfo::TargetLibraryInfo(const TargetLibraryInfo &TLI) : ImmutablePass(ID) { memcpy(AvailableArray, TLI.AvailableArray, sizeof(AvailableArray)); + CustomNames = TLI.CustomNames; } diff --git a/lib/Target/TargetMachine.cpp b/lib/Target/TargetMachine.cpp index daac924..805e16e 100644 --- a/lib/Target/TargetMachine.cpp +++ b/lib/Target/TargetMachine.cpp @@ -226,6 +226,14 @@ CodeModel::Model TargetMachine::getCodeModel() const { return CodeGenInfo->getCodeModel(); } +/// getOptLevel - Returns the optimization level: None, Less, +/// Default, or Aggressive. +CodeGenOpt::Level TargetMachine::getOptLevel() const { + if (!CodeGenInfo) + return CodeGenOpt::Default; + return CodeGenInfo->getOptLevel(); +} + bool TargetMachine::getAsmVerbosityDefault() { return AsmVerbosityDefault; } diff --git a/lib/Target/X86/InstPrinter/X86InstComments.cpp b/lib/Target/X86/InstPrinter/X86InstComments.cpp index 8d85b95..6e87efa 100644 --- a/lib/Target/X86/InstPrinter/X86InstComments.cpp +++ b/lib/Target/X86/InstPrinter/X86InstComments.cpp @@ -34,6 +34,12 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, switch (MI->getOpcode()) { case X86::INSERTPSrr: + Src1Name = getRegName(MI->getOperand(0).getReg()); + Src2Name = getRegName(MI->getOperand(2).getReg()); + DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); + break; + case X86::VINSERTPSrr: + DestName = getRegName(MI->getOperand(0).getReg()); Src1Name = getRegName(MI->getOperand(1).getReg()); Src2Name = getRegName(MI->getOperand(2).getReg()); DecodeINSERTPSMask(MI->getOperand(3).getImm(), ShuffleMask); @@ -44,34 +50,52 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, Src1Name = getRegName(MI->getOperand(0).getReg()); DecodeMOVLHPSMask(2, ShuffleMask); break; + case X86::VMOVLHPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVLHPSMask(2, ShuffleMask); + break; case X86::MOVHLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); Src1Name = getRegName(MI->getOperand(0).getReg()); DecodeMOVHLPSMask(2, ShuffleMask); break; + case X86::VMOVHLPSrr: + Src2Name = getRegName(MI->getOperand(2).getReg()); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + DecodeMOVHLPSMask(2, ShuffleMask); + break; case X86::PSHUFDri: + case X86::VPSHUFDri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. 
case X86::PSHUFDmi: + case X86::VPSHUFDmi: DestName = getRegName(MI->getOperand(0).getReg()); DecodePSHUFMask(4, MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; case X86::PSHUFHWri: + case X86::VPSHUFHWri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFHWmi: + case X86::VPSHUFHWmi: DestName = getRegName(MI->getOperand(0).getReg()); DecodePSHUFHWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); break; case X86::PSHUFLWri: + case X86::VPSHUFLWri: Src1Name = getRegName(MI->getOperand(1).getReg()); // FALL THROUGH. case X86::PSHUFLWmi: + case X86::VPSHUFLWmi: DestName = getRegName(MI->getOperand(0).getReg()); DecodePSHUFLWMask(MI->getOperand(MI->getNumOperands()-1).getImm(), ShuffleMask); @@ -142,6 +166,14 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VSHUFPDrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPDrmi: + DecodeSHUFPSMask(2, MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::SHUFPSrri: Src2Name = getRegName(MI->getOperand(2).getReg()); @@ -150,63 +182,107 @@ void llvm::EmitAnyX86InstComments(const MCInst *MI, raw_ostream &OS, DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; + case X86::VSHUFPSrri: + Src2Name = getRegName(MI->getOperand(2).getReg()); + // FALL THROUGH. + case X86::VSHUFPSrmi: + DecodeSHUFPSMask(4, MI->getOperand(3).getImm(), ShuffleMask); + Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); + break; case X86::UNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPDrm: - DecodeUNPCKLPDMask(2, ShuffleMask); + DecodeUNPCKLPMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPDrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDrm: - DecodeUNPCKLPDMask(2, ShuffleMask); + DecodeUNPCKLPMask(MVT::v2f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPDYrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPDYrm: - DecodeUNPCKLPDMask(4, ShuffleMask); + DecodeUNPCKLPMask(MVT::v4f64, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::UNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::UNPCKLPSrm: - DecodeUNPCKLPSMask(4, ShuffleMask); + DecodeUNPCKLPMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPSrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. case X86::VUNPCKLPSrm: - DecodeUNPCKLPSMask(4, ShuffleMask); + DecodeUNPCKLPMask(MVT::v4f32, ShuffleMask); Src1Name = getRegName(MI->getOperand(1).getReg()); + DestName = getRegName(MI->getOperand(0).getReg()); break; case X86::VUNPCKLPSYrr: Src2Name = getRegName(MI->getOperand(2).getReg()); // FALL THROUGH. 
case X86::VUNPCKLPSYrm:
-    DecodeUNPCKLPSMask(8, ShuffleMask);
+    DecodeUNPCKLPMask(MVT::v8f32, ShuffleMask);
     Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
     break;
   case X86::UNPCKHPDrr:
     Src2Name = getRegName(MI->getOperand(2).getReg());
     // FALL THROUGH.
   case X86::UNPCKHPDrm:
-    DecodeUNPCKHPMask(2, ShuffleMask);
+    DecodeUNPCKHPMask(MVT::v2f64, ShuffleMask);
     Src1Name = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VUNPCKHPDrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPDrm:
+    DecodeUNPCKHPMask(MVT::v2f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPDYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPDYrm:
+    DecodeUNPCKHPMask(MVT::v4f64, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
   case X86::UNPCKHPSrr:
     Src2Name = getRegName(MI->getOperand(2).getReg());
     // FALL THROUGH.
   case X86::UNPCKHPSrm:
-    DecodeUNPCKHPMask(4, ShuffleMask);
+    DecodeUNPCKHPMask(MVT::v4f32, ShuffleMask);
     Src1Name = getRegName(MI->getOperand(0).getReg());
     break;
+  case X86::VUNPCKHPSrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPSrm:
+    DecodeUNPCKHPMask(MVT::v4f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
+  case X86::VUNPCKHPSYrr:
+    Src2Name = getRegName(MI->getOperand(2).getReg());
+    // FALL THROUGH.
+  case X86::VUNPCKHPSYrm:
+    DecodeUNPCKHPMask(MVT::v8f32, ShuffleMask);
+    Src1Name = getRegName(MI->getOperand(1).getReg());
+    DestName = getRegName(MI->getOperand(0).getReg());
+    break;
   case X86::VPERMILPSri:
     DecodeVPERMILPSMask(4, MI->getOperand(2).getImm(), ShuffleMask);
diff --git a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
index 03c3948..a843515 100644
--- a/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
+++ b/lib/Target/X86/MCTargetDesc/X86MCTargetDesc.cpp
@@ -385,7 +385,8 @@ static MCAsmInfo *createX86MCAsmInfo(const Target &T, StringRef TT) {
 }
 static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
-                                             CodeModel::Model CM) {
+                                             CodeModel::Model CM,
+                                             CodeGenOpt::Level OL) {
   MCCodeGenInfo *X = new MCCodeGenInfo();
   Triple T(TT);
@@ -429,7 +430,7 @@ static MCCodeGenInfo *createX86MCCodeGenInfo(StringRef TT, Reloc::Model RM,
     // 64-bit JIT places everything in the same buffer except external funcs.
     CM = is64Bit ?
CodeModel::Large : CodeModel::Small; - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.cpp b/lib/Target/X86/Utils/X86ShuffleDecode.cpp index aeb3309..f6c9d7b 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.cpp +++ b/lib/Target/X86/Utils/X86ShuffleDecode.cpp @@ -142,29 +142,29 @@ void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, } } -void DecodeUNPCKHPMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - for (unsigned i = 0; i != NElts/2; ++i) { - ShuffleMask.push_back(i+NElts/2); // Reads from dest - ShuffleMask.push_back(i+NElts+NElts/2); // Reads from src - } -} +void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { + unsigned NumElts = VT.getVectorNumElements(); -void DecodeUNPCKLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i32, NElts), ShuffleMask); -} + // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate + // independently on 128-bit lanes. + unsigned NumLanes = VT.getSizeInBits() / 128; + if (NumLanes == 0 ) NumLanes = 1; // Handle MMX + unsigned NumLaneElts = NumElts / NumLanes; -void DecodeUNPCKLPDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask) { - DecodeUNPCKLPMask(MVT::getVectorVT(MVT::i64, NElts), ShuffleMask); + for (unsigned s = 0; s < NumLanes; ++s) { + unsigned Start = s * NumLaneElts + NumLaneElts/2; + unsigned End = s * NumLaneElts + NumLaneElts; + for (unsigned i = Start; i != End; ++i) { + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 + } + } } /// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd /// etc. VT indicates the type of the vector allowing it to handle different /// datatypes and vector widths. -void DecodeUNPCKLPMask(EVT VT, - SmallVectorImpl<unsigned> &ShuffleMask) { +void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask) { unsigned NumElts = VT.getVectorNumElements(); // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -173,16 +173,13 @@ void DecodeUNPCKLPMask(EVT VT, if (NumLanes == 0 ) NumLanes = 1; // Handle MMX unsigned NumLaneElts = NumElts / NumLanes; - unsigned Start = 0; - unsigned End = NumLaneElts / 2; for (unsigned s = 0; s < NumLanes; ++s) { + unsigned Start = s * NumLaneElts; + unsigned End = s * NumLaneElts + NumLaneElts/2; for (unsigned i = Start; i != End; ++i) { - ShuffleMask.push_back(i); // Reads from dest/src1 - ShuffleMask.push_back(i+NumLaneElts); // Reads from src/src2 + ShuffleMask.push_back(i); // Reads from dest/src1 + ShuffleMask.push_back(i+NumElts); // Reads from src/src2 } - // Process the next 128 bits. - Start += NumLaneElts; - End += NumLaneElts; } } diff --git a/lib/Target/X86/Utils/X86ShuffleDecode.h b/lib/Target/X86/Utils/X86ShuffleDecode.h index 58193e6..35f6530 100644 --- a/lib/Target/X86/Utils/X86ShuffleDecode.h +++ b/lib/Target/X86/Utils/X86ShuffleDecode.h @@ -67,20 +67,15 @@ void DecodePUNPCKHMask(unsigned NElts, void DecodeSHUFPSMask(unsigned NElts, unsigned Imm, SmallVectorImpl<unsigned> &ShuffleMask); -void DecodeUNPCKHPMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeUNPCKLPSMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); - -void DecodeUNPCKLPDMask(unsigned NElts, - SmallVectorImpl<unsigned> &ShuffleMask); +/// DecodeUNPCKHPMask - This decodes the shuffle masks for unpckhps/unpckhpd +/// etc. 
VT indicates the type of the vector allowing it to handle different
+/// datatypes and vector widths.
+void DecodeUNPCKHPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask);

 /// DecodeUNPCKLPMask - This decodes the shuffle masks for unpcklps/unpcklpd
 /// etc. VT indicates the type of the vector allowing it to handle different
 /// datatypes and vector widths.
-void DecodeUNPCKLPMask(EVT VT,
-                       SmallVectorImpl<unsigned> &ShuffleMask);
+void DecodeUNPCKLPMask(EVT VT, SmallVectorImpl<unsigned> &ShuffleMask);

 // DecodeVPERMILPSMask - Decodes VPERMILPS permutes for any 128-bit 32-bit
diff --git a/lib/Target/X86/X86ISelDAGToDAG.cpp b/lib/Target/X86/X86ISelDAGToDAG.cpp
index 3d75de0..3c35763 100644
--- a/lib/Target/X86/X86ISelDAGToDAG.cpp
+++ b/lib/Target/X86/X86ISelDAGToDAG.cpp
@@ -2216,6 +2216,75 @@ SDNode *X86DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+  case ISD::STORE: {
+    // The DEC64m tablegen pattern is currently not able to match the case
+    // where the EFLAGS on the original DEC are used.
+    // We'll need to improve tablegen to allow flags to be transferred from a
+    // node in the pattern to the result node, probably with a new keyword.
+    // For example, we have this:
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //              [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //               (implicit EFLAGS)]>;
+    // but we may need something like this:
+    // def DEC64m : RI<0xFF, MRM1m, (outs), (ins i64mem:$dst), "dec{q}\t$dst",
+    //              [(store (add (loadi64 addr:$dst), -1), addr:$dst),
+    //               (transferrable EFLAGS)]>;
+    StoreSDNode *StoreNode = cast<StoreSDNode>(Node);
+    SDValue Chain = StoreNode->getOperand(0);
+    SDValue StoredVal = StoreNode->getOperand(1);
+    SDValue Address = StoreNode->getOperand(2);
+    SDValue Undef = StoreNode->getOperand(3);
+
+    if (StoreNode->getMemOperand()->getSize() != 8 ||
+        Undef->getOpcode() != ISD::UNDEF ||
+        Chain->getOpcode() != ISD::LOAD ||
+        StoredVal->getOpcode() != X86ISD::DEC ||
+        StoredVal.getResNo() != 0 ||
+        StoredVal->getOperand(0).getNode() != Chain.getNode())
+      break;
+
+    //OPC_CheckPredicate, 1, // Predicate_nontemporalstore
+    if (StoreNode->isNonTemporal())
+      break;
+
+    LoadSDNode *LoadNode = cast<LoadSDNode>(Chain.getNode());
+    if (LoadNode->getOperand(1) != Address ||
+        LoadNode->getOperand(2) != Undef)
+      break;
+
+    if (!ISD::isNormalLoad(LoadNode))
+      break;
+
+    if (!ISD::isNormalStore(StoreNode))
+      break;
+
+    // Check that the load chain has only one use (from the store).
+    if (!Chain.hasOneUse())
+      break;
+
+    // Merge the input chains if they are not intra-pattern references.
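// The effect of this new ISD::STORE case, sketched (the assembly below is
// illustrative, not taken from the patch): a 64-bit read-modify-write
// through one address,
//   movq (%rdi), %rax
//   decq %rax
//   movq %rax, (%rdi)
// is selected to the single memory-destination form
//   decq (%rdi)
// with DEC64m's EFLAGS result replacing the register DEC's flag output and
// its chain result replacing the store's chain. The InputChain taken just
// below becomes the folded node's chain operand.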
+ SDValue InputChain = LoadNode->getOperand(0); + + SDValue Base, Scale, Index, Disp, Segment; + if (!SelectAddr(LoadNode, LoadNode->getBasePtr(), + Base, Scale, Index, Disp, Segment)) + break; + + MachineSDNode::mmo_iterator MemOp = MF->allocateMemRefsArray(2); + MemOp[0] = StoreNode->getMemOperand(); + MemOp[1] = LoadNode->getMemOperand(); + const SDValue Ops[] = { Base, Scale, Index, Disp, Segment, InputChain }; + MachineSDNode *Result = CurDAG->getMachineNode(X86::DEC64m, + Node->getDebugLoc(), + MVT::i32, MVT::Other, Ops, + array_lengthof(Ops)); + Result->setMemRefs(MemOp, MemOp + 2); + + ReplaceUses(SDValue(StoreNode, 0), SDValue(Result, 1)); + ReplaceUses(SDValue(StoredVal.getNode(), 1), SDValue(Result, 0)); + + return Result; + } } SDNode *ResNode = SelectCode(Node); diff --git a/lib/Target/X86/X86ISelLowering.cpp b/lib/Target/X86/X86ISelLowering.cpp index 4e11131..96c6f41 100644 --- a/lib/Target/X86/X86ISelLowering.cpp +++ b/lib/Target/X86/X86ISelLowering.cpp @@ -35,7 +35,6 @@ #include "llvm/CodeGen/MachineJumpTableInfo.h" #include "llvm/CodeGen/MachineModuleInfo.h" #include "llvm/CodeGen/MachineRegisterInfo.h" -#include "llvm/CodeGen/PseudoSourceValue.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCExpr.h" @@ -909,7 +908,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) setOperationAction(ISD::SINT_TO_FP, MVT::v4i32, Legal); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41orAVX()) { setOperationAction(ISD::FFLOOR, MVT::f32, Legal); setOperationAction(ISD::FCEIL, MVT::f32, Legal); setOperationAction(ISD::FTRUNC, MVT::f32, Legal); @@ -981,7 +980,7 @@ X86TargetLowering::X86TargetLowering(X86TargetMachine &TM) } } - if (Subtarget->hasSSE42() || Subtarget->hasAVX()) + if (Subtarget->hasSSE42orAVX()) setOperationAction(ISD::SETCC, MVT::v2i64, Custom); if (!UseSoftFloat && Subtarget->hasAVX()) { @@ -2846,16 +2845,12 @@ static bool isTargetShuffle(unsigned Opcode) { case X86ISD::MOVSD: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLDQ: case X86ISD::PUNPCKLQDQ: case X86ISD::UNPCKHPS: case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: case X86ISD::PUNPCKHWD: case X86ISD::PUNPCKHBW: case X86ISD::PUNPCKHDQ: @@ -2927,16 +2922,12 @@ static SDValue getTargetShuffleNode(unsigned Opc, DebugLoc dl, EVT VT, case X86ISD::MOVSD: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLDQ: case X86ISD::PUNPCKLQDQ: case X86ISD::UNPCKHPS: case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: case X86ISD::PUNPCKHWD: case X86ISD::PUNPCKHBW: case X86ISD::PUNPCKHDQ: @@ -3416,6 +3407,41 @@ static unsigned getShuffleVSHUFPDYImmediate(SDNode *N) { return Mask; } +/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming +/// the two vector operands have swapped position. 
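// Worked example of the transformation described above (illustrative, not
// part of the patch): for v4f64, the mask <0, 5, 2, 7> picks elements 0 and 2
// from the first operand and 1 and 3 from the second; once the operands swap,
// the same selection is written <4, 1, 6, 3> -- every in-range index moves by
// NumElems across the operand boundary, and negative (undef) entries stay put.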
+static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) { + unsigned NumElems = VT.getVectorNumElements(); + for (unsigned i = 0; i != NumElems; ++i) { + int idx = Mask[i]; + if (idx < 0) + continue; + else if (idx < (int)NumElems) + Mask[i] = idx + NumElems; + else + Mask[i] = idx - NumElems; + } +} + +/// isCommutedVSHUFP() - Return true if swapping operands will +/// allow to use the "vshufpd" or "vshufps" instruction +/// for 256-bit vectors +static bool isCommutedVSHUFPMask(const SmallVectorImpl<int> &Mask, EVT VT, + const X86Subtarget *Subtarget) { + + unsigned NumElems = VT.getVectorNumElements(); + if ((VT.getSizeInBits() != 256) || ((NumElems != 4) && (NumElems != 8))) + return false; + + SmallVector<int, 8> CommutedMask; + for (unsigned i = 0; i < NumElems; ++i) + CommutedMask.push_back(Mask[i]); + + CommuteVectorShuffleMask(CommutedMask, VT); + return (NumElems == 4) ? isVSHUFPDYMask(CommutedMask, VT, Subtarget): + isVSHUFPSYMask(CommutedMask, VT, Subtarget); +} + + /// isSHUFPMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to 128-bit /// SHUFPS and SHUFPD. @@ -3551,13 +3577,14 @@ bool X86::isMOVLHPSMask(ShuffleVectorSDNode *N) { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { + bool HasAVX2, bool V2IsSplat = false) { int NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. AVX defines UNPCK* to operate @@ -3591,22 +3618,23 @@ static bool isUNPCKLMask(const SmallVectorImpl<int> &Mask, EVT VT, return true; } -bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat) { +bool X86::isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) { SmallVector<int, 8> M; N->getMask(M); - return ::isUNPCKLMask(M, N->getValueType(0), V2IsSplat); + return ::isUNPCKLMask(M, N->getValueType(0), HasAVX2, V2IsSplat); } /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT, - bool V2IsSplat = false) { + bool HasAVX2, bool V2IsSplat = false) { int NumElts = VT.getVectorNumElements(); assert((VT.is128BitVector() || VT.is256BitVector()) && "Unsupported vector type for unpckh"); - if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8) + if (VT.getSizeInBits() == 256 && NumElts != 4 && NumElts != 8 && + (!HasAVX2 || (NumElts != 16 && NumElts != 32))) return false; // Handle 128 and 256-bit vector lengths. 
AVX defines UNPCK* to operate
@@ -3638,10 +3666,10 @@ static bool isUNPCKHMask(const SmallVectorImpl<int> &Mask, EVT VT,
   return true;
 }

-bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat) {
+bool X86::isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, bool V2IsSplat) {
   SmallVector<int, 8> M;
   N->getMask(M);
-  return ::isUNPCKHMask(M, N->getValueType(0), V2IsSplat);
+  return ::isUNPCKHMask(M, N->getValueType(0), HasAVX2, V2IsSplat);
 }

 /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form
@@ -3953,7 +3981,7 @@ static bool isCommutedMOVL(ShuffleVectorSDNode *N, bool V2IsSplat = false,
 /// Masks to match: <1, 1, 3, 3> or <1, 1, 3, 3, 5, 5, 7, 7>
 bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
                          const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
+  if (!Subtarget->hasSSE3orAVX())
     return false;

   // The second vector must be undef
@@ -3981,7 +4009,7 @@ bool X86::isMOVSHDUPMask(ShuffleVectorSDNode *N,
 /// Masks to match: <0, 0, 2, 2> or <0, 0, 2, 2, 4, 4, 6, 6>
 bool X86::isMOVSLDUPMask(ShuffleVectorSDNode *N,
                          const X86Subtarget *Subtarget) {
-  if (!Subtarget->hasSSE3() && !Subtarget->hasAVX())
+  if (!Subtarget->hasSSE3orAVX())
     return false;

   // The second vector must be undef
@@ -4216,21 +4244,6 @@ static SDValue CommuteVectorShuffle(ShuffleVectorSDNode *SVOp,
                               SVOp->getOperand(0), &MaskVec[0]);
 }

-/// CommuteVectorShuffleMask - Change values in a shuffle permute mask assuming
-/// the two vector operands have swapped position.
-static void CommuteVectorShuffleMask(SmallVectorImpl<int> &Mask, EVT VT) {
-  unsigned NumElems = VT.getVectorNumElements();
-  for (unsigned i = 0; i != NumElems; ++i) {
-    int idx = Mask[i];
-    if (idx < 0)
-      continue;
-    else if (idx < (int)NumElems)
-      Mask[i] = idx + NumElems;
-    else
-      Mask[i] = idx - NumElems;
-  }
-}
-
 /// ShouldXformToMOVHLPS - Return true if the node should be transformed to
 /// match movhlps. The lower half elements should come from upper half of
 /// V1 (and in order), and the upper half elements should come from the upper
@@ -4388,23 +4401,30 @@ static SDValue getZeroVector(EVT VT, bool HasXMMInt, SelectionDAG &DAG,
 }

 /// getOnesVector - Returns a vector of specified type with all bits set.
-/// Always build ones vectors as <4 x i32>. For 256-bit types, use two
-/// <4 x i32> inserted in a <8 x i32> appropriately. Then bitcast to their
-/// original type, ensuring they get CSE'd.
+/// Always build ones vectors as <4 x i32> or <8 x i32>. For 256-bit types with
+/// no AVX2 support, use two <4 x i32> inserted in a <8 x i32> appropriately.
+/// Then bitcast to their original type, ensuring they get CSE'd.
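// Shape of the three construction paths below (an illustrative summary, not
// patch code): with AVX2, a 256-bit all-ones value is a single v8i32
// BUILD_VECTOR of ~0U, selectable to one vpcmpeqd on a ymm register; without
// AVX2 the same value is assembled from two <4 x i32> halves inserted into a
// v8i32 undef; 128-bit requests remain a single v4i32 node.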
+static SDValue getOnesVector(EVT VT, bool HasAVX2, SelectionDAG &DAG, + DebugLoc dl) { assert(VT.isVector() && "Expected a vector type"); assert((VT.is128BitVector() || VT.is256BitVector()) && "Expected a 128-bit or 256-bit vector type"); SDValue Cst = DAG.getTargetConstant(~0U, MVT::i32); - SDValue Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, - Cst, Cst, Cst, Cst); - - if (VT.is256BitVector()) { - SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), - Vec, DAG.getConstant(0, MVT::i32), DAG, dl); - Vec = Insert128BitVector(InsV, Vec, - DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + SDValue Vec; + if (VT.getSizeInBits() == 256) { + if (HasAVX2) { // AVX2 + SDValue Ops[] = { Cst, Cst, Cst, Cst, Cst, Cst, Cst, Cst }; + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v8i32, Ops, 8); + } else { // AVX + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); + SDValue InsV = Insert128BitVector(DAG.getNode(ISD::UNDEF, dl, MVT::v8i32), + Vec, DAG.getConstant(0, MVT::i32), DAG, dl); + Vec = Insert128BitVector(InsV, Vec, + DAG.getConstant(4 /* NumElems/2 */, MVT::i32), DAG, dl); + } + } else { + Vec = DAG.getNode(ISD::BUILD_VECTOR, dl, MVT::v4i32, Cst, Cst, Cst, Cst); } return DAG.getNode(ISD::BITCAST, dl, VT, Vec); @@ -4623,9 +4643,7 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, break; case X86ISD::UNPCKHPS: case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: - DecodeUNPCKHPMask(NumElems, ShuffleMask); + DecodeUNPCKHPMask(VT, ShuffleMask); break; case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLWD: @@ -4635,8 +4653,6 @@ static SDValue getShuffleScalarElt(SDNode *N, int Index, SelectionDAG &DAG, break; case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: DecodeUNPCKLPMask(VT, ShuffleMask); break; case X86ISD::MOVHLPS: @@ -5111,6 +5127,97 @@ static SDValue EltsFromConsecutiveLoads(EVT VT, SmallVectorImpl<SDValue> &Elts, return SDValue(); } +/// isVectorBroadcast - Check if the node chain is suitable to be xformed to +/// a vbroadcast node. We support two patterns: +/// 1. A splat BUILD_VECTOR which uses a single scalar load. +/// 2. A splat shuffle which uses a scalar_to_vector node which comes from +/// a scalar load. +/// The scalar load node is returned when a pattern is found, +/// or SDValue() otherwise. +static SDValue isVectorBroadcast(SDValue &Op, bool hasAVX2) { + EVT VT = Op.getValueType(); + SDValue V = Op; + + if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) + V = V.getOperand(0); + + //A suspected load to be broadcasted. + SDValue Ld; + + switch (V.getOpcode()) { + default: + // Unknown pattern found. + return SDValue(); + + case ISD::BUILD_VECTOR: { + // The BUILD_VECTOR node must be a splat. + if (!isSplatVector(V.getNode())) + return SDValue(); + + Ld = V.getOperand(0); + + // The suspected load node has several users. Make sure that all + // of its users are from the BUILD_VECTOR node. + if (!Ld->hasNUsesOfValue(VT.getVectorNumElements(), 0)) + return SDValue(); + break; + } + + case ISD::VECTOR_SHUFFLE: { + ShuffleVectorSDNode *SVOp = cast<ShuffleVectorSDNode>(Op); + + // Shuffles must have a splat mask where the first element is + // broadcasted. 
+ if ((!SVOp->isSplat()) || SVOp->getMaskElt(0) != 0) + return SDValue(); + + SDValue Sc = Op.getOperand(0); + if (Sc.getOpcode() != ISD::SCALAR_TO_VECTOR) + return SDValue(); + + Ld = Sc.getOperand(0); + + // The scalar_to_vector node and the suspected + // load node must have exactly one user. + if (!Sc.hasOneUse() || !Ld.hasOneUse()) + return SDValue(); + break; + } + } + + // The scalar source must be a normal load. + if (!ISD::isNormalLoad(Ld.getNode())) + return SDValue(); + + bool Is256 = VT.getSizeInBits() == 256; + bool Is128 = VT.getSizeInBits() == 128; + unsigned ScalarSize = Ld.getValueType().getSizeInBits(); + + if (hasAVX2) { + // VBroadcast to YMM + if (Is256 && (ScalarSize == 8 || ScalarSize == 16 || + ScalarSize == 32 || ScalarSize == 64 )) + return Ld; + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 8 || ScalarSize == 32 || + ScalarSize == 16 || ScalarSize == 64 )) + return Ld; + } + + // VBroadcast to YMM + if (Is256 && (ScalarSize == 32 || ScalarSize == 64)) + return Ld; + + // VBroadcast to XMM + if (Is128 && (ScalarSize == 32)) + return Ld; + + + // Unsupported broadcast. + return SDValue(); +} + SDValue X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { DebugLoc dl = Op.getDebugLoc(); @@ -5131,14 +5238,20 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { } // Vectors containing all ones can be matched by pcmpeqd on 128-bit width - // vectors or broken into v4i32 operations on 256-bit vectors. + // vectors or broken into v4i32 operations on 256-bit vectors. AVX2 can use + // vpcmpeqd on 256-bit vectors. if (ISD::isBuildVectorAllOnes(Op.getNode())) { - if (Op.getValueType() == MVT::v4i32) + if (Op.getValueType() == MVT::v4i32 || + (Op.getValueType() == MVT::v8i32 && Subtarget->hasAVX2())) return Op; - return getOnesVector(Op.getValueType(), DAG, dl); + return getOnesVector(Op.getValueType(), Subtarget->hasAVX2(), DAG, dl); } + SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); + if (Subtarget->hasAVX() && LD.getNode()) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); + unsigned EVTBits = ExtVT.getSizeInBits(); unsigned NumZero = 0; @@ -5380,7 +5493,7 @@ X86TargetLowering::LowerBUILD_VECTOR(SDValue Op, SelectionDAG &DAG) const { return LD; // For SSE 4.1, use insertps to put the high elements into the low element. - if (getSubtarget()->hasSSE41() || getSubtarget()->hasAVX()) { + if (getSubtarget()->hasSSE41orAVX()) { SDValue Result; if (Op.getOperand(0).getOpcode() != ISD::UNDEF) Result = DAG.getNode(ISD::SCALAR_TO_VECTOR, dl, VT, Op.getOperand(0)); @@ -5551,7 +5664,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // quads, disable the next transformation since it does not help SSSE3. bool V1Used = InputQuads[0] || InputQuads[1]; bool V2Used = InputQuads[2] || InputQuads[3]; - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3orAVX()) { if (InputQuads.count() == 2 && V1Used && V2Used) { BestLoQuad = InputQuads.find_first(); BestHiQuad = InputQuads.find_next(BestLoQuad); @@ -5624,7 +5737,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, // If we have SSSE3, and all words of the result are from 1 input vector, // case 2 is generated, otherwise case 3 is generated. If no SSSE3 // is present, fall back to case 4. 
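// The isVectorBroadcast matcher above ends in a small legality table: which
// scalar widths may be splatted from memory into which vector widths.
// Restated as a standalone sketch (illustrative only):
static bool broadcastLegal(unsigned VecBits, unsigned ScalarBits,
                           bool HasAVX2) {
  if (VecBits != 128 && VecBits != 256)
    return false;
  if (HasAVX2) // AVX2: 8/16/32/64-bit scalars to either XMM or YMM
    return ScalarBits == 8 || ScalarBits == 16 ||
           ScalarBits == 32 || ScalarBits == 64;
  if (VecBits == 256) // plain AVX: vbroadcastss/vbroadcastsd to YMM
    return ScalarBits == 32 || ScalarBits == 64;
  return ScalarBits == 32; // plain AVX: vbroadcastss to XMM only
}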
- if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { + if (Subtarget->hasSSSE3orAVX()) { SmallVector<SDValue,16> pshufbMask; // If we have elements from both input vectors, set the high bit of the @@ -5692,8 +5805,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) NewV = getTargetShuffleNode(X86ISD::PSHUFLW, dl, MVT::v8i16, NewV.getOperand(0), X86::getShufflePSHUFLWImmediate(NewV.getNode()), @@ -5721,8 +5833,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLEv8i16(SDValue Op, NewV = DAG.getVectorShuffle(MVT::v8i16, dl, NewV, DAG.getUNDEF(MVT::v8i16), &MaskV[0]); - if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && - (Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (NewV.getOpcode() == ISD::VECTOR_SHUFFLE && Subtarget->hasSSSE3orAVX()) NewV = getTargetShuffleNode(X86ISD::PSHUFHW, dl, MVT::v8i16, NewV.getOperand(0), X86::getShufflePSHUFHWImmediate(NewV.getNode()), @@ -5788,7 +5899,7 @@ SDValue LowerVECTOR_SHUFFLEv16i8(ShuffleVectorSDNode *SVOp, } // If SSSE3, use 1 pshufb instruction per vector with elements in the result. - if (TLI.getSubtarget()->hasSSSE3() || TLI.getSubtarget()->hasAVX()) { + if (TLI.getSubtarget()->hasSSSE3orAVX()) { SmallVector<SDValue,16> pshufbMask; // If all result elements are from one input vector, then only translate @@ -6455,17 +6566,23 @@ SDValue getMOVLP(SDValue &Op, DebugLoc &dl, SelectionDAG &DAG, bool HasXMMInt) { X86::getShuffleSHUFImmediate(SVOp), DAG); } -static inline unsigned getUNPCKLOpcode(EVT VT) { +static inline unsigned getUNPCKLOpcode(EVT VT, bool HasAVX2) { switch(VT.getSimpleVT().SimpleTy) { case MVT::v4i32: return X86ISD::PUNPCKLDQ; case MVT::v2i64: return X86ISD::PUNPCKLQDQ; + case MVT::v8i32: + if (HasAVX2) return X86ISD::PUNPCKLDQ; + // else use fp unit for int unpack. + case MVT::v8f32: case MVT::v4f32: return X86ISD::UNPCKLPS; + case MVT::v4i64: + if (HasAVX2) return X86ISD::PUNPCKLQDQ; + // else use fp unit for int unpack. + case MVT::v4f64: case MVT::v2f64: return X86ISD::UNPCKLPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKLPSY; - case MVT::v4i64: // Use fp unit for int unpack. - case MVT::v4f64: return X86ISD::VUNPCKLPDY; + case MVT::v32i8: case MVT::v16i8: return X86ISD::PUNPCKLBW; + case MVT::v16i16: case MVT::v8i16: return X86ISD::PUNPCKLWD; default: llvm_unreachable("Unknown type for unpckl"); @@ -6473,17 +6590,23 @@ static inline unsigned getUNPCKLOpcode(EVT VT) { return 0; } -static inline unsigned getUNPCKHOpcode(EVT VT) { +static inline unsigned getUNPCKHOpcode(EVT VT, bool HasAVX2) { switch(VT.getSimpleVT().SimpleTy) { case MVT::v4i32: return X86ISD::PUNPCKHDQ; case MVT::v2i64: return X86ISD::PUNPCKHQDQ; + case MVT::v8i32: + if (HasAVX2) return X86ISD::PUNPCKHDQ; + // else use fp unit for int unpack. + case MVT::v8f32: case MVT::v4f32: return X86ISD::UNPCKHPS; + case MVT::v4i64: + if (HasAVX2) return X86ISD::PUNPCKHQDQ; + // else use fp unit for int unpack. + case MVT::v4f64: case MVT::v2f64: return X86ISD::UNPCKHPD; - case MVT::v8i32: // Use fp unit for int unpack. - case MVT::v8f32: return X86ISD::VUNPCKHPSY; - case MVT::v4i64: // Use fp unit for int unpack. 
- case MVT::v4f64: return X86ISD::VUNPCKHPDY; + case MVT::v32i8: case MVT::v16i8: return X86ISD::PUNPCKHBW; + case MVT::v16i16: case MVT::v8i16: return X86ISD::PUNPCKHWD; default: llvm_unreachable("Unknown type for unpckh"); @@ -6507,52 +6630,6 @@ static inline unsigned getVPERMILOpcode(EVT VT) { return 0; } -/// isVectorBroadcast - Check if the node chain is suitable to be xformed to -/// a vbroadcast node. The nodes are suitable whenever we can fold a load coming -/// from a 32 or 64 bit scalar. Update Op to the desired load to be folded. -static bool isVectorBroadcast(SDValue &Op) { - EVT VT = Op.getValueType(); - bool Is256 = VT.getSizeInBits() == 256; - - assert((VT.getSizeInBits() == 128 || Is256) && - "Unsupported type for vbroadcast node"); - - SDValue V = Op; - if (V.hasOneUse() && V.getOpcode() == ISD::BITCAST) - V = V.getOperand(0); - - if (Is256 && !(V.hasOneUse() && - V.getOpcode() == ISD::INSERT_SUBVECTOR && - V.getOperand(0).getOpcode() == ISD::UNDEF)) - return false; - - if (Is256) - V = V.getOperand(1); - - if (!V.hasOneUse()) - return false; - - // Check the source scalar_to_vector type. 256-bit broadcasts are - // supported for 32/64-bit sizes, while 128-bit ones are only supported - // for 32-bit scalars. - if (V.getOpcode() != ISD::SCALAR_TO_VECTOR) - return false; - - unsigned ScalarSize = V.getOperand(0).getValueType().getSizeInBits(); - if (ScalarSize != 32 && ScalarSize != 64) - return false; - if (!Is256 && ScalarSize == 64) - return false; - - V = V.getOperand(0); - if (!MayFoldLoad(V)) - return false; - - // Return the load node - Op = V; - return true; -} - static SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, const TargetLowering &TLI, @@ -6578,8 +6655,9 @@ SDValue NormalizeVectorShuffle(SDValue Op, SelectionDAG &DAG, return Op; // Use vbroadcast whenever the splat comes from a foldable load - if (Subtarget->hasAVX() && isVectorBroadcast(V1)) - return DAG.getNode(X86ISD::VBROADCAST, dl, VT, V1); + SDValue LD = isVectorBroadcast(Op, Subtarget->hasAVX2()); + if (Subtarget->hasAVX() && LD.getNode()) + return DAG.getNode(X86ISD::VBROADCAST, dl, VT, LD); // Handle splats by matching through known shuffle masks if ((Size == 128 && NumElem <= 4) || @@ -6630,6 +6708,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool V1IsSplat = false; bool V2IsSplat = false; bool HasXMMInt = Subtarget->hasXMMInt(); + bool HasAVX2 = Subtarget->hasAVX2(); MachineFunction &MF = DAG.getMachineFunction(); bool OptForSize = MF.getFunction()->hasFnAttr(Attribute::OptimizeForSize); @@ -6659,12 +6738,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { // NOTE: isPSHUFDMask can also match both masks below (unpckl_undef and // unpckh_undef). Only use pshufd if speed is more important than size. 
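A hedged illustration of the unpack opcode selection just above: for the 256-bit integer types, the HasAVX2 flag keeps the operation in the integer domain, while plain AVX deliberately falls through to the floating-point forms, which are the only 256-bit encodings it has. Snippet for illustration only, using the helper as defined in this patch:

  // v8i32 low unpack: integer unit with AVX2, FP unit without it.
  unsigned IntOpc = getUNPCKLOpcode(MVT::v8i32, /*HasAVX2=*/true);  // PUNPCKLDQ
  unsigned FpOpc  = getUNPCKLOpcode(MVT::v8i32, /*HasAVX2=*/false); // UNPCKLPS

The fallthrough presumably accepts a possible domain-crossing penalty as the price of pre-AVX2 support.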
if (OptForSize && X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1, + DAG); if (OptForSize && X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, + DAG); - if (X86::isMOVDDUPMask(SVOp) && - (Subtarget->hasSSE3() || Subtarget->hasAVX()) && + if (X86::isMOVDDUPMask(SVOp) && Subtarget->hasSSE3orAVX() && V2IsUndef && RelaxedMayFoldVectorLoad(V1)) return getMOVDDup(Op, dl, V1, DAG); @@ -6672,9 +6752,10 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVHighToLow(Op, dl, DAG); // Use to match splats - if (HasXMMInt && X86::isUNPCKHMask(SVOp) && V2IsUndef && + if (HasXMMInt && X86::isUNPCKHMask(SVOp, HasAVX2) && V2IsUndef && (VT == MVT::v2f64 || VT == MVT::v2i64)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, + DAG); if (X86::isPSHUFDMask(SVOp)) { // The actual implementation will match the mask in the if above and then @@ -6696,8 +6777,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { bool isLeft = false; unsigned ShAmt = 0; SDValue ShVal; - bool isShift = getSubtarget()->hasXMMInt() && - isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); + bool isShift = HasXMMInt && isVectorShift(SVOp, DAG, isLeft, ShVal, ShAmt); if (isShift && ShVal.hasOneUse()) { // If the shifted value has multiple uses, it may be cheaper to use // v_set0 + movlhps or movhlps, etc. @@ -6721,7 +6801,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { } // FIXME: fold these into legal mask. 
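For reference, the element selections the isUNPCKLMask/isUNPCKHMask predicates accept, written out for a 4-element type (indices address the concatenation V1||V2); an illustrative aside, not patch code:

  static const int UnpcklMask4[4] = { 0, 4, 1, 5 }; // low-half interleave
  static const int UnpckhMask4[4] = { 2, 6, 3, 7 }; // high-half interleave

The *_v_undef variants checked above accept the same interleave with both inputs equal, the canonical vector_shuffle v, v, <0, 4, 1, 5> form this patch's header comments describe.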
- if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp)) + if (X86::isMOVLHPSMask(SVOp) && !X86::isUNPCKLMask(SVOp, HasAVX2)) return getMOVLowToHigh(Op, dl, DAG, HasXMMInt); if (X86::isMOVHLPSMask(SVOp)) @@ -6774,11 +6854,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getMOVL(DAG, dl, VT, V2, V1); } - if (X86::isUNPCKLMask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V2, DAG); + if (X86::isUNPCKLMask(SVOp, HasAVX2)) + return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V2, + DAG); - if (X86::isUNPCKHMask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V2, DAG); + if (X86::isUNPCKHMask(SVOp, HasAVX2)) + return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V2, + DAG); if (V2IsSplat) { // Normalize mask so all entries that point to V2 points to its first @@ -6787,9 +6869,9 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDValue NewMask = NormalizeMask(SVOp, DAG); ShuffleVectorSDNode *NSVOp = cast<ShuffleVectorSDNode>(NewMask); if (NSVOp != SVOp) { - if (X86::isUNPCKLMask(NSVOp, true)) { + if (X86::isUNPCKLMask(NSVOp, HasAVX2, true)) { return NewMask; - } else if (X86::isUNPCKHMask(NSVOp, true)) { + } else if (X86::isUNPCKHMask(NSVOp, HasAVX2, true)) { return NewMask; } } @@ -6801,11 +6883,13 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SDValue NewOp = CommuteVectorShuffle(SVOp, DAG); ShuffleVectorSDNode *NewSVOp = cast<ShuffleVectorSDNode>(NewOp); - if (X86::isUNPCKLMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V2, V1, DAG); + if (X86::isUNPCKLMask(NewSVOp, HasAVX2)) + return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V2, V1, + DAG); - if (X86::isUNPCKHMask(NewSVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V2, V1, DAG); + if (X86::isUNPCKHMask(NewSVOp, HasAVX2)) + return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V2, V1, + DAG); } // Normalize the node to match x86 shuffle ops if needed @@ -6818,7 +6902,7 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { SmallVector<int, 16> M; SVOp->getMask(M); - if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX())) + if (isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX())) return getTargetShuffleNode(X86ISD::PALIGN, dl, VT, V1, V2, X86::getShufflePALIGNRImmediate(SVOp), DAG); @@ -6846,9 +6930,11 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { X86::getShuffleSHUFImmediate(SVOp), DAG); if (X86::isUNPCKL_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKLOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKLOpcode(VT, HasAVX2), dl, VT, V1, V1, + DAG); if (X86::isUNPCKH_v_undef_Mask(SVOp)) - return getTargetShuffleNode(getUNPCKHOpcode(VT), dl, VT, V1, V1, DAG); + return getTargetShuffleNode(getUNPCKHOpcode(VT, HasAVX2), dl, VT, V1, V1, + DAG); //===--------------------------------------------------------------------===// // Generate target specific nodes for 128 or 256-bit shuffles only @@ -6884,6 +6970,17 @@ X86TargetLowering::LowerVECTOR_SHUFFLE(SDValue Op, SelectionDAG &DAG) const { return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, getShuffleVSHUFPDYImmediate(SVOp), DAG); + // Try to swap operands in the node to match x86 shuffle ops + if (isCommutedVSHUFPMask(M, VT, Subtarget)) { + // Now we need to commute operands. 
+ SVOp = cast<ShuffleVectorSDNode>(CommuteVectorShuffle(SVOp, DAG)); + V1 = SVOp->getOperand(0); + V2 = SVOp->getOperand(1); + unsigned Immediate = (NumElems == 4) ? getShuffleVSHUFPDYImmediate(SVOp): + getShuffleVSHUFPSYImmediate(SVOp); + return getTargetShuffleNode(getSHUFPOpcode(VT), dl, VT, V1, V2, Immediate, DAG); + } + //===--------------------------------------------------------------------===// // Since no target specific shuffle was selected for this generic one, // lower it into other known shuffles. FIXME: this isn't true yet, but @@ -7002,7 +7099,7 @@ X86TargetLowering::LowerEXTRACT_VECTOR_ELT(SDValue Op, assert(Vec.getValueSizeInBits() <= 128 && "Unexpected vector length"); - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) { + if (Subtarget->hasSSE41orAVX()) { SDValue Res = LowerEXTRACT_VECTOR_ELT_SSE4(Op, DAG); if (Res.getNode()) return Res; @@ -7144,7 +7241,7 @@ X86TargetLowering::LowerINSERT_VECTOR_ELT(SDValue Op, SelectionDAG &DAG) const { return Insert128BitVector(N0, V, Ins128Idx, DAG, dl); } - if (Subtarget->hasSSE41() || Subtarget->hasAVX()) + if (Subtarget->hasSSE41orAVX()) return LowerINSERT_VECTOR_ELT_SSE4(Op, DAG); if (EltVT == MVT::i8) @@ -8264,8 +8361,10 @@ SDValue X86TargetLowering::EmitTest(SDValue Op, unsigned X86CC, // climbing the DAG back to the root, and it doesn't seem to be worth the // effort. for (SDNode::use_iterator UI = Op.getNode()->use_begin(), - UE = Op.getNode()->use_end(); UI != UE; ++UI) - if (UI->getOpcode() != ISD::CopyToReg && UI->getOpcode() != ISD::SETCC) + UE = Op.getNode()->use_end(); UI != UE; ++UI) + if (UI->getOpcode() != ISD::CopyToReg && + UI->getOpcode() != ISD::SETCC && + UI->getOpcode() != ISD::STORE) goto default_case; if (ConstantSDNode *C = @@ -8408,11 +8507,19 @@ SDValue X86TargetLowering::LowerToBT(SDValue And, ISD::CondCode CC, } } else if (Op1.getOpcode() == ISD::Constant) { ConstantSDNode *AndRHS = cast<ConstantSDNode>(Op1); + uint64_t AndRHSVal = AndRHS->getZExtValue(); SDValue AndLHS = Op0; - if (AndRHS->getZExtValue() == 1 && AndLHS.getOpcode() == ISD::SRL) { + + if (AndRHSVal == 1 && AndLHS.getOpcode() == ISD::SRL) { LHS = AndLHS.getOperand(0); RHS = AndLHS.getOperand(1); } + + // Use BT if the immediate can't be encoded in a TEST instruction. + if (!isUInt<32>(AndRHSVal) && isPowerOf2_64(AndRHSVal)) { + LHS = AndLHS; + RHS = DAG.getConstant(Log2_64_Ceil(AndRHSVal), LHS.getValueType()); + } } if (LHS.getNode()) { @@ -8632,9 +8739,9 @@ SDValue X86TargetLowering::LowerVSETCC(SDValue Op, SelectionDAG &DAG) const { // Check that the operation in question is available (most are plain SSE2, // but PCMPGTQ and PCMPEQQ have different requirements). 
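A hedged example of the new LowerToBT case above: a single-bit mask too wide for TEST's 32-bit immediate is rewritten as a bit test against the bit index. The C++ function is an assumed caller and the assembly comment is illustrative, not patch output:

  bool bit40Set(unsigned long long X) {
    // Previously this materialized the 1ULL << 40 constant for TEST;
    // with the change it can lower to roughly: bt $40, %rdi ; setb %al
    return (X & (1ULL << 40)) != 0;
  }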
- if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPGTQ && !Subtarget->hasSSE42orAVX()) return SDValue(); - if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41() && !Subtarget->hasAVX()) + if (Opc == X86ISD::PCMPEQQ && !Subtarget->hasSSE41orAVX()) return SDValue(); // Since SSE has no unsigned integer comparisons, we need to flip the sign @@ -9464,6 +9571,23 @@ X86TargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, SelectionDAG &DAG) const case Intrinsic::x86_avx_hsub_pd_256: return DAG.getNode(X86ISD::FHSUB, dl, Op.getValueType(), Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psllv_d: + case Intrinsic::x86_avx2_psllv_q: + case Intrinsic::x86_avx2_psllv_d_256: + case Intrinsic::x86_avx2_psllv_q_256: + return DAG.getNode(ISD::SHL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrlv_d: + case Intrinsic::x86_avx2_psrlv_q: + case Intrinsic::x86_avx2_psrlv_d_256: + case Intrinsic::x86_avx2_psrlv_q_256: + return DAG.getNode(ISD::SRL, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + case Intrinsic::x86_avx2_psrav_d: + case Intrinsic::x86_avx2_psrav_d_256: + return DAG.getNode(ISD::SRA, dl, Op.getValueType(), + Op.getOperand(1), Op.getOperand(2)); + // ptest and testp intrinsics. The intrinsic these come from are designed to // return an integer value, not just an instruction so lower it to the ptest // or testp pattern and a setcc for the result. @@ -10261,47 +10385,48 @@ SDValue X86TargetLowering::LowerShift(SDValue Op, SelectionDAG &DAG) const { return Res; } - if (Subtarget->hasAVX2()) { - if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pslli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pslli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SHL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v4i64 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrli_q, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrli_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRL) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v8i32 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrai_d, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); - - if (VT == MVT::v16i16 && Op.getOpcode() == ISD::SRA) - return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, - DAG.getConstant(Intrinsic::x86_avx2_psrai_w, MVT::i32), - R, DAG.getConstant(ShiftAmt, MVT::i32)); + if (Subtarget->hasAVX2() && VT == MVT::v32i8) { + if (Op.getOpcode() == ISD::SHL) { + // Make a large shift. 
+        SDValue SHL =
+          DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                      DAG.getConstant(Intrinsic::x86_avx2_pslli_w, MVT::i32),
+                      R, DAG.getConstant(ShiftAmt, MVT::i32));
+        // Zero out the rightmost bits.
+        SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U << ShiftAmt),
+                                                       MVT::i8));
+        return DAG.getNode(ISD::AND, dl, VT, SHL,
+                           DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+      }
+      if (Op.getOpcode() == ISD::SRL) {
+        // Make a large shift.
+        SDValue SRL =
+          DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT,
+                      DAG.getConstant(Intrinsic::x86_avx2_psrli_w, MVT::i32),
+                      R, DAG.getConstant(ShiftAmt, MVT::i32));
+        // Zero out the leftmost bits.
+        SmallVector<SDValue, 32> V(32, DAG.getConstant(uint8_t(-1U) >> ShiftAmt,
+                                                       MVT::i8));
+        return DAG.getNode(ISD::AND, dl, VT, SRL,
+                           DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32));
+      }
+      if (Op.getOpcode() == ISD::SRA) {
+        if (ShiftAmt == 7) {
+          // R s>> 7 === R s< 0
+          SDValue Zeros = getZeroVector(VT, true /* HasXMMInt */, DAG, dl);
+          return DAG.getNode(X86ISD::PCMPGTB, dl, VT, Zeros, R);
+        }
+
+        // R s>> a === ((R u>> a) ^ m) - m, where m = 128 >> a is the
+        // shifted-in sign bit; the xor/sub pair sign-extends the lanes.
+        SDValue Res = DAG.getNode(ISD::SRL, dl, VT, R, Amt);
+        SmallVector<SDValue, 32> V(32, DAG.getConstant(128 >> ShiftAmt,
+                                                       MVT::i8));
+        SDValue Mask = DAG.getNode(ISD::BUILD_VECTOR, dl, VT, &V[0], 32);
+        Res = DAG.getNode(ISD::XOR, dl, VT, Res, Mask);
+        Res = DAG.getNode(ISD::SUB, dl, VT, Res, Mask);
+        return Res;
      }
+    }
    }
  }
@@ -10493,9 +10618,9 @@ SDValue X86TargetLowering::LowerXALUO(SDValue Op, SelectionDAG &DAG) const {
 SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op,
                                                   SelectionDAG &DAG) const{
   DebugLoc dl = Op.getDebugLoc();
-  SDNode* Node = Op.getNode();
-  EVT ExtraVT = cast<VTSDNode>(Node->getOperand(1))->getVT();
-  EVT VT = Node->getValueType(0);
+  EVT ExtraVT = cast<VTSDNode>(Op.getOperand(1))->getVT();
+  EVT VT = Op.getValueType();
+
   if (Subtarget->hasXMMInt() && VT.isVector()) {
     unsigned BitsDiff = VT.getScalarType().getSizeInBits() -
                         ExtraVT.getScalarType().getSizeInBits();
@@ -10506,21 +10631,55 @@ SDValue X86TargetLowering::LowerSIGN_EXTEND_INREG(SDValue Op, SelectionDAG &DAG)
     switch (VT.getSimpleVT().SimpleTy) {
       default: return SDValue();
-      case MVT::v4i32: {
+      case MVT::v4i32:
        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_d;
        SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_d;
        break;
-      }
-      case MVT::v8i16: {
+      case MVT::v8i16:
        SHLIntrinsicsID = Intrinsic::x86_sse2_pslli_w;
        SRAIntrinsicsID = Intrinsic::x86_sse2_psrai_w;
        break;
-      }
+      case MVT::v8i32:
+      case MVT::v16i16:
+        if (!Subtarget->hasAVX())
+          return SDValue();
+        if (!Subtarget->hasAVX2()) {
+          // No 256-bit integer shifts without AVX2: split into two halves.
+          int NumElems = VT.getVectorNumElements();
+          SDValue Idx0 = DAG.getConstant(0, MVT::i32);
+          SDValue Idx1 = DAG.getConstant(NumElems/2, MVT::i32);
+
+          // Extract the LHS vectors
+          SDValue LHS = Op.getOperand(0);
+          SDValue LHS1 = Extract128BitVector(LHS, Idx0, DAG, dl);
+          SDValue LHS2 = Extract128BitVector(LHS, Idx1, DAG, dl);
+
+          MVT EltVT = VT.getVectorElementType().getSimpleVT();
+          EVT NewVT = MVT::getVectorVT(EltVT, NumElems/2);
+
+          EVT ExtraEltVT = ExtraVT.getVectorElementType();
+          int ExtraNumElems = ExtraVT.getVectorNumElements();
+          ExtraVT = EVT::getVectorVT(*DAG.getContext(), ExtraEltVT,
+                                     ExtraNumElems/2);
+          SDValue Extra = DAG.getValueType(ExtraVT);
+
+          LHS1 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS1, Extra);
+          LHS2 = DAG.getNode(Op.getOpcode(), dl, NewVT, LHS2, Extra);
+
+          return DAG.getNode(ISD::CONCAT_VECTORS, dl, VT, LHS1, LHS2);
+        }
+        if (VT == MVT::v8i32) {
+          SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_d;
+
SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_d; + } else { + SHLIntrinsicsID = Intrinsic::x86_avx2_pslli_w; + SRAIntrinsicsID = Intrinsic::x86_avx2_psrai_w; + } } SDValue Tmp1 = DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(SHLIntrinsicsID, MVT::i32), - Node->getOperand(0), ShAmt); + Op.getOperand(0), ShAmt); return DAG.getNode(ISD::INTRINSIC_WO_CHAIN, dl, VT, DAG.getConstant(SRAIntrinsicsID, MVT::i32), @@ -11033,9 +11192,7 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::PINSRW: return "X86ISD::PINSRW"; case X86ISD::PSHUFB: return "X86ISD::PSHUFB"; case X86ISD::ANDNP: return "X86ISD::ANDNP"; - case X86ISD::PSIGNB: return "X86ISD::PSIGNB"; - case X86ISD::PSIGNW: return "X86ISD::PSIGNW"; - case X86ISD::PSIGND: return "X86ISD::PSIGND"; + case X86ISD::PSIGN: return "X86ISD::PSIGN"; case X86ISD::BLENDV: return "X86ISD::BLENDV"; case X86ISD::FHADD: return "X86ISD::FHADD"; case X86ISD::FHSUB: return "X86ISD::FHSUB"; @@ -11111,7 +11268,6 @@ const char *X86TargetLowering::getTargetNodeName(unsigned Opcode) const { case X86ISD::MOVSS: return "X86ISD::MOVSS"; case X86ISD::UNPCKLPS: return "X86ISD::UNPCKLPS"; case X86ISD::UNPCKLPD: return "X86ISD::UNPCKLPD"; - case X86ISD::VUNPCKLPDY: return "X86ISD::VUNPCKLPDY"; case X86ISD::UNPCKHPS: return "X86ISD::UNPCKHPS"; case X86ISD::UNPCKHPD: return "X86ISD::UNPCKHPD"; case X86ISD::PUNPCKLBW: return "X86ISD::PUNPCKLBW"; @@ -11235,7 +11391,7 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, EVT VT) const { // Very little shuffling can be done for 64-bit vectors right now. if (VT.getSizeInBits() == 64) - return isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()); + return isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()); // FIXME: pshufb, blends, shifts. 
return (VT.getVectorNumElements() == 2 || @@ -11245,9 +11401,9 @@ X86TargetLowering::isShuffleMaskLegal(const SmallVectorImpl<int> &M, isPSHUFDMask(M, VT) || isPSHUFHWMask(M, VT) || isPSHUFLWMask(M, VT) || - isPALIGNRMask(M, VT, Subtarget->hasSSSE3() || Subtarget->hasAVX()) || - isUNPCKLMask(M, VT) || - isUNPCKHMask(M, VT) || + isPALIGNRMask(M, VT, Subtarget->hasSSSE3orAVX()) || + isUNPCKLMask(M, VT, Subtarget->hasAVX2()) || + isUNPCKHMask(M, VT, Subtarget->hasAVX2()) || isUNPCKL_v_undef_Mask(M, VT) || isUNPCKH_v_undef_Mask(M, VT)); } @@ -11654,7 +11810,7 @@ X86TargetLowering::EmitAtomicMinMaxWithCustomInserter(MachineInstr *mInstr, MachineBasicBlock * X86TargetLowering::EmitPCMP(MachineInstr *MI, MachineBasicBlock *BB, unsigned numArgs, bool memArg) const { - assert((Subtarget->hasSSE42() || Subtarget->hasAVX()) && + assert(Subtarget->hasSSE42orAVX() && "Target must have SSE4.2 or AVX features enabled"); DebugLoc dl = MI->getDebugLoc(); @@ -13808,98 +13964,98 @@ static SDValue PerformOrCombine(SDNode *N, SelectionDAG &DAG, return R; EVT VT = N->getValueType(0); - if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64 && VT != MVT::v2i64) - return SDValue(); SDValue N0 = N->getOperand(0); SDValue N1 = N->getOperand(1); // look for psign/blend - if (Subtarget->hasSSSE3() || Subtarget->hasAVX()) { - if (VT == MVT::v2i64) { - // Canonicalize pandn to RHS - if (N0.getOpcode() == X86ISD::ANDNP) - std::swap(N0, N1); - // or (and (m, x), (pandn m, y)) - if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { - SDValue Mask = N1.getOperand(0); - SDValue X = N1.getOperand(1); - SDValue Y; - if (N0.getOperand(0) == Mask) - Y = N0.getOperand(1); - if (N0.getOperand(1) == Mask) - Y = N0.getOperand(0); - - // Check to see if the mask appeared in both the AND and ANDNP and - if (!Y.getNode()) - return SDValue(); + if (VT == MVT::v2i64 || VT == MVT::v4i64) { + if (!Subtarget->hasSSSE3orAVX() || + (VT == MVT::v4i64 && !Subtarget->hasAVX2())) + return SDValue(); - // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. - if (Mask.getOpcode() != ISD::BITCAST || - X.getOpcode() != ISD::BITCAST || - Y.getOpcode() != ISD::BITCAST) - return SDValue(); + // Canonicalize pandn to RHS + if (N0.getOpcode() == X86ISD::ANDNP) + std::swap(N0, N1); + // or (and (m, x), (pandn m, y)) + if (N0.getOpcode() == ISD::AND && N1.getOpcode() == X86ISD::ANDNP) { + SDValue Mask = N1.getOperand(0); + SDValue X = N1.getOperand(1); + SDValue Y; + if (N0.getOperand(0) == Mask) + Y = N0.getOperand(1); + if (N0.getOperand(1) == Mask) + Y = N0.getOperand(0); + + // Check to see if the mask appeared in both the AND and ANDNP and + if (!Y.getNode()) + return SDValue(); - // Look through mask bitcast. - Mask = Mask.getOperand(0); - EVT MaskVT = Mask.getValueType(); + // Validate that X, Y, and Mask are BIT_CONVERTS, and see through them. + if (Mask.getOpcode() != ISD::BITCAST || + X.getOpcode() != ISD::BITCAST || + Y.getOpcode() != ISD::BITCAST) + return SDValue(); - // Validate that the Mask operand is a vector sra node. The sra node - // will be an intrinsic. - if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN) - return SDValue(); + // Look through mask bitcast. 
+      Mask = Mask.getOperand(0);
+      EVT MaskVT = Mask.getValueType();
-        // Validate that the Mask operand is a vector sra node. The sra node
-        // will be an intrinsic.
-        if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
-          return SDValue();
+      // Validate that the Mask operand is a vector sra node. The sra node
+      // will be an intrinsic.
+      if (Mask.getOpcode() != ISD::INTRINSIC_WO_CHAIN)
+        return SDValue();
-        // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
-        // there is no psrai.b
-        switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
-        case Intrinsic::x86_sse2_psrai_w:
-        case Intrinsic::x86_sse2_psrai_d:
-          break;
-        default: return SDValue();
-        }
+      // FIXME: what to do for bytes, since there is a psignb/pblendvb, but
+      // there is no psrai.b
+      switch (cast<ConstantSDNode>(Mask.getOperand(0))->getZExtValue()) {
+      case Intrinsic::x86_sse2_psrai_w:
+      case Intrinsic::x86_sse2_psrai_d:
+      case Intrinsic::x86_avx2_psrai_w:
+      case Intrinsic::x86_avx2_psrai_d:
+        break;
+      default: return SDValue();
+      }
-        // Check that the SRA is all signbits.
-        SDValue SraC = Mask.getOperand(2);
-        unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
-        unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
-        if ((SraAmt + 1) != EltBits)
-          return SDValue();
-        DebugLoc DL = N->getDebugLoc();
-
-        // Now we know we at least have a plendvb with the mask val. See if
-        // we can form a psignb/w/d.
-        // psign = x.type == y.type == mask.type && y = sub(0, x);
-        X = X.getOperand(0);
-        Y = Y.getOperand(0);
-        if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
-            ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
-            X.getValueType() == MaskVT && X.getValueType() == Y.getValueType()){
-          unsigned Opc = 0;
-          switch (EltBits) {
-          case 8: Opc = X86ISD::PSIGNB; break;
-          case 16: Opc = X86ISD::PSIGNW; break;
-          case 32: Opc = X86ISD::PSIGND; break;
-          default: break;
-          }
-          if (Opc) {
-            SDValue Sign = DAG.getNode(Opc, DL, MaskVT, X, Mask.getOperand(1));
-            return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Sign);
-          }
-        }
-        // PBLENDVB only available on SSE 4.1
-        if (!(Subtarget->hasSSE41() || Subtarget->hasAVX()))
-          return SDValue();
+      // Check that the SRA is all signbits.
+      SDValue SraC = Mask.getOperand(2);
+      unsigned SraAmt = cast<ConstantSDNode>(SraC)->getZExtValue();
+      unsigned EltBits = MaskVT.getVectorElementType().getSizeInBits();
+      if ((SraAmt + 1) != EltBits)
+        return SDValue();
-        X = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, X);
-        Y = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Y);
-        Mask = DAG.getNode(ISD::BITCAST, DL, MVT::v16i8, Mask);
-        Mask = DAG.getNode(ISD::VSELECT, DL, MVT::v16i8, Mask, X, Y);
-        return DAG.getNode(ISD::BITCAST, DL, MVT::v2i64, Mask);
+      DebugLoc DL = N->getDebugLoc();
+
+      // Now we know we at least have a pblendvb with the mask val. See if
+      // we can form a psignb/w/d.
+      // psign = x.type == y.type == mask.type && y = sub(0, x);
+      X = X.getOperand(0);
+      Y = Y.getOperand(0);
+      if (Y.getOpcode() == ISD::SUB && Y.getOperand(1) == X &&
+          ISD::isBuildVectorAllZeros(Y.getOperand(0).getNode()) &&
+          X.getValueType() == MaskVT && X.getValueType() == Y.getValueType() &&
+          (EltBits == 8 || EltBits == 16 || EltBits == 32)) {
+        SDValue Sign = DAG.getNode(X86ISD::PSIGN, DL, MaskVT, X,
+                                   Mask.getOperand(1));
+        return DAG.getNode(ISD::BITCAST, DL, VT, Sign);
      }
+      // PBLENDVB only available on SSE 4.1
+      if (!Subtarget->hasSSE41orAVX())
+        return SDValue();
+
+      EVT BlendVT = (VT == MVT::v4i64) ? MVT::v32i8 : MVT::v16i8;
+
+      X = DAG.getNode(ISD::BITCAST, DL, BlendVT, X);
+      Y = DAG.getNode(ISD::BITCAST, DL, BlendVT, Y);
+      Mask = DAG.getNode(ISD::BITCAST, DL, BlendVT, Mask);
+      Mask = DAG.getNode(ISD::VSELECT, DL, BlendVT, Mask, X, Y);
+      return DAG.getNode(ISD::BITCAST, DL, VT, Mask);
    }
  }
+  if (VT != MVT::i16 && VT != MVT::i32 && VT != MVT::i64)
+    return SDValue();
+
  // fold (or (x << c) | (y >> (64 - c))) ==> (shld64 x, y, c)
  if (N0.getOpcode() == ISD::SRL && N1.getOpcode() == ISD::SHL)
    std::swap(N0, N1);
@@ -14409,8 +14565,7 @@ static SDValue PerformFADDCombine(SDNode *N, SelectionDAG &DAG,
  SDValue RHS = N->getOperand(1);
  // Try to synthesize horizontal adds from adds of shuffles.
-  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
-      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+  if (Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64) &&
      isHorizontalBinOp(LHS, RHS, true))
    return DAG.getNode(X86ISD::FHADD, N->getDebugLoc(), VT, LHS, RHS);
  return SDValue();
@@ -14424,8 +14579,7 @@ static SDValue PerformFSUBCombine(SDNode *N, SelectionDAG &DAG,
  SDValue RHS = N->getOperand(1);
  // Try to synthesize horizontal subs from subs of shuffles.
-  if ((Subtarget->hasSSE3() || Subtarget->hasAVX()) &&
-      (VT == MVT::v4f32 || VT == MVT::v2f64) &&
+  if (Subtarget->hasSSE3orAVX() && (VT == MVT::v4f32 || VT == MVT::v2f64) &&
      isHorizontalBinOp(LHS, RHS, false))
    return DAG.getNode(X86ISD::FHSUB, N->getDebugLoc(), VT, LHS, RHS);
  return SDValue();
@@ -14621,7 +14775,23 @@ static SDValue OptimizeConditionalInDecrement(SDNode *N, SelectionDAG &DAG) {
                     DAG.getConstant(0, OtherVal.getValueType()), NewCmp);
 }
-static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG) {
+/// PerformAddCombine - Do target-specific dag combines on integer adds.
+static SDValue PerformAddCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
+  EVT VT = N->getValueType(0);
+  SDValue Op0 = N->getOperand(0);
+  SDValue Op1 = N->getOperand(1);
+
+  // Try to synthesize horizontal adds from adds of shuffles.
+  if ((Subtarget->hasSSSE3orAVX()) && (VT == MVT::v8i16 || VT == MVT::v4i32) &&
+      isHorizontalBinOp(Op0, Op1, true))
+    return DAG.getNode(X86ISD::HADD, N->getDebugLoc(), VT, Op0, Op1);
+
+  return OptimizeConditionalInDecrement(N, DAG);
+}
+
+static SDValue PerformSubCombine(SDNode *N, SelectionDAG &DAG,
+                                 const X86Subtarget *Subtarget) {
  SDValue Op0 = N->getOperand(0);
  SDValue Op1 = N->getOperand(1);
@@ -14643,6 +14813,12 @@
    }
  }
+  // Try to synthesize horizontal subs from subs of shuffles.
+ EVT VT = N->getValueType(0); + if ((Subtarget->hasSSSE3orAVX()) && (VT == MVT::v8i16 || VT == MVT::v4i32) && + isHorizontalBinOp(Op0, Op1, false)) + return DAG.getNode(X86ISD::HSUB, N->getDebugLoc(), VT, Op0, Op1); + return OptimizeConditionalInDecrement(N, DAG); } @@ -14656,8 +14832,8 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case ISD::VSELECT: case ISD::SELECT: return PerformSELECTCombine(N, DAG, Subtarget); case X86ISD::CMOV: return PerformCMOVCombine(N, DAG, DCI); - case ISD::ADD: return OptimizeConditionalInDecrement(N, DAG); - case ISD::SUB: return PerformSubCombine(N, DAG); + case ISD::ADD: return PerformAddCombine(N, DAG, Subtarget); + case ISD::SUB: return PerformSubCombine(N, DAG, Subtarget); case X86ISD::ADC: return PerformADCCombine(N, DAG, DCI); case ISD::MUL: return PerformMulCombine(N, DAG, DCI); case ISD::SHL: @@ -14687,16 +14863,12 @@ SDValue X86TargetLowering::PerformDAGCombine(SDNode *N, case X86ISD::PUNPCKHQDQ: case X86ISD::UNPCKHPS: case X86ISD::UNPCKHPD: - case X86ISD::VUNPCKHPSY: - case X86ISD::VUNPCKHPDY: case X86ISD::PUNPCKLBW: case X86ISD::PUNPCKLWD: case X86ISD::PUNPCKLDQ: case X86ISD::PUNPCKLQDQ: case X86ISD::UNPCKLPS: case X86ISD::UNPCKLPD: - case X86ISD::VUNPCKLPSY: - case X86ISD::VUNPCKLPDY: case X86ISD::MOVHLPS: case X86ISD::MOVLHPS: case X86ISD::PSHUFD: diff --git a/lib/Target/X86/X86ISelLowering.h b/lib/Target/X86/X86ISelLowering.h index 3b7a14d..ccff3a5 100644 --- a/lib/Target/X86/X86ISelLowering.h +++ b/lib/Target/X86/X86ISelLowering.h @@ -172,12 +172,18 @@ namespace llvm { /// ANDNP - Bitwise Logical AND NOT of Packed FP values. ANDNP, - /// PSIGNB/W/D - Copy integer sign. - PSIGNB, PSIGNW, PSIGND, + /// PSIGN - Copy integer sign. + PSIGN, /// BLEND family of opcodes BLENDV, + /// HADD - Integer horizontal add. + HADD, + + /// HSUB - Integer horizontal sub. + HSUB, + /// FHADD - Floating point horizontal add. FHADD, @@ -269,12 +275,8 @@ namespace llvm { MOVSS, UNPCKLPS, UNPCKLPD, - VUNPCKLPSY, - VUNPCKLPDY, UNPCKHPS, UNPCKHPD, - VUNPCKHPSY, - VUNPCKHPDY, PUNPCKLBW, PUNPCKLWD, PUNPCKLDQ, @@ -408,11 +410,13 @@ namespace llvm { /// isUNPCKLMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKL. - bool isUNPCKLMask(ShuffleVectorSDNode *N, bool V2IsSplat = false); + bool isUNPCKLMask(ShuffleVectorSDNode *N, bool HasAVX2, + bool V2IsSplat = false); /// isUNPCKHMask - Return true if the specified VECTOR_SHUFFLE operand /// specifies a shuffle of elements that is suitable for input to UNPCKH. - bool isUNPCKHMask(ShuffleVectorSDNode *N, bool V2IsSplat = false); + bool isUNPCKHMask(ShuffleVectorSDNode *N, bool HasAVX2, + bool V2IsSplat = false); /// isUNPCKL_v_undef_Mask - Special case of isUNPCKLMask for canonical form /// of vector_shuffle v, v, <0, 4, 1, 5>, i.e. 
vector_shuffle v, undef, diff --git a/lib/Target/X86/X86InstrBuilder.h b/lib/Target/X86/X86InstrBuilder.h index 0245e5c..fa1d676 100644 --- a/lib/Target/X86/X86InstrBuilder.h +++ b/lib/Target/X86/X86InstrBuilder.h @@ -27,7 +27,6 @@ #include "llvm/CodeGen/MachineFrameInfo.h" #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineMemOperand.h" -#include "llvm/CodeGen/PseudoSourceValue.h" namespace llvm { diff --git a/lib/Target/X86/X86InstrFragmentsSIMD.td b/lib/Target/X86/X86InstrFragmentsSIMD.td index 6fd2efd..791bbe6 100644 --- a/lib/Target/X86/X86InstrFragmentsSIMD.td +++ b/lib/Target/X86/X86InstrFragmentsSIMD.td @@ -41,6 +41,8 @@ def X86fsrl : SDNode<"X86ISD::FSRL", SDTX86FPShiftOp>; def X86fgetsign: SDNode<"X86ISD::FGETSIGNx86",SDTFPToIntOp>; def X86fhadd : SDNode<"X86ISD::FHADD", SDTFPBinOp>; def X86fhsub : SDNode<"X86ISD::FHSUB", SDTFPBinOp>; +def X86hadd : SDNode<"X86ISD::HADD", SDTIntBinOp>; +def X86hsub : SDNode<"X86ISD::HSUB", SDTIntBinOp>; def X86comi : SDNode<"X86ISD::COMI", SDTX86CmpTest>; def X86ucomi : SDNode<"X86ISD::UCOMI", SDTX86CmpTest>; def X86cmpss : SDNode<"X86ISD::FSETCCss", SDTX86Cmpss>; @@ -51,14 +53,8 @@ def X86pshufb : SDNode<"X86ISD::PSHUFB", def X86andnp : SDNode<"X86ISD::ANDNP", SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; -def X86psignb : SDNode<"X86ISD::PSIGNB", - SDTypeProfile<1, 2, [SDTCisVT<0, v16i8>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; -def X86psignw : SDNode<"X86ISD::PSIGNW", - SDTypeProfile<1, 2, [SDTCisVT<0, v8i16>, SDTCisSameAs<0,1>, - SDTCisSameAs<0,2>]>>; -def X86psignd : SDNode<"X86ISD::PSIGND", - SDTypeProfile<1, 2, [SDTCisVT<0, v4i32>, SDTCisSameAs<0,1>, +def X86psign : SDNode<"X86ISD::PSIGN", + SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisSameAs<0,2>]>>; def X86pextrb : SDNode<"X86ISD::PEXTRB", SDTypeProfile<1, 2, [SDTCisVT<0, i32>, SDTCisPtrTy<2>]>>; @@ -136,13 +132,9 @@ def X86Movlpd : SDNode<"X86ISD::MOVLPD", SDTShuff2Op>; def X86Unpcklps : SDNode<"X86ISD::UNPCKLPS", SDTShuff2Op>; def X86Unpcklpd : SDNode<"X86ISD::UNPCKLPD", SDTShuff2Op>; -def X86Unpcklpsy : SDNode<"X86ISD::VUNPCKLPSY", SDTShuff2Op>; -def X86Unpcklpdy : SDNode<"X86ISD::VUNPCKLPDY", SDTShuff2Op>; def X86Unpckhps : SDNode<"X86ISD::UNPCKHPS", SDTShuff2Op>; def X86Unpckhpd : SDNode<"X86ISD::UNPCKHPD", SDTShuff2Op>; -def X86Unpckhpsy : SDNode<"X86ISD::VUNPCKHPSY", SDTShuff2Op>; -def X86Unpckhpdy : SDNode<"X86ISD::VUNPCKHPDY", SDTShuff2Op>; def X86Punpcklbw : SDNode<"X86ISD::PUNPCKLBW", SDTShuff2Op>; def X86Punpcklwd : SDNode<"X86ISD::PUNPCKLWD", SDTShuff2Op>; @@ -427,12 +419,12 @@ def movl : PatFrag<(ops node:$lhs, node:$rhs), def unpckl : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N)); + return X86::isUNPCKLMask(cast<ShuffleVectorSDNode>(N), Subtarget->hasAVX2()); }]>; def unpckh : PatFrag<(ops node:$lhs, node:$rhs), (vector_shuffle node:$lhs, node:$rhs), [{ - return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N)); + return X86::isUNPCKHMask(cast<ShuffleVectorSDNode>(N), Subtarget->hasAVX2()); }]>; def pshufd : PatFrag<(ops node:$lhs, node:$rhs), diff --git a/lib/Target/X86/X86InstrInfo.cpp b/lib/Target/X86/X86InstrInfo.cpp index 9428fff..24c4a53 100644 --- a/lib/Target/X86/X86InstrInfo.cpp +++ b/lib/Target/X86/X86InstrInfo.cpp @@ -25,7 +25,6 @@ #include "llvm/CodeGen/MachineInstrBuilder.h" #include "llvm/CodeGen/MachineRegisterInfo.h" #include "llvm/CodeGen/LiveVariables.h" -#include "llvm/CodeGen/PseudoSourceValue.h" 
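On the X86psign consolidation above: a single vector-typed PSIGN node now stands in for the old b/w/d triple. For reference, its per-element behavior, following the SSSE3 psign definition; a hedged scalar model, not patch code:

  static int psignElt(int A, int B) {
    if (B < 0)  return -A; // B negative: negate the lane
    if (B == 0) return 0;  // B zero: clear the lane
    return A;              // B positive: pass the lane through
  }

Typing the node with SDTCisVec<0> plus SDTCisSameAs constraints, as above, lets one definition serve the 128-bit types and, with AVX2, the 256-bit ones.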
#include "llvm/MC/MCInst.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" @@ -2903,6 +2902,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, switch (LoadMI->getOpcode()) { case X86::AVX_SET0PSY: case X86::AVX_SET0PDY: + case X86::AVX2_SETALLONES: Alignment = 32; break; case X86::V_SET0: @@ -2948,6 +2948,7 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, case X86::AVX_SET0PSY: case X86::AVX_SET0PDY: case X86::AVX_SETALLONES: + case X86::AVX2_SETALLONES: case X86::FsFLD0SD: case X86::FsFLD0SS: case X86::VFsFLD0SD: @@ -2986,7 +2987,8 @@ MachineInstr* X86InstrInfo::foldMemoryOperandImpl(MachineFunction &MF, else Ty = VectorType::get(Type::getInt32Ty(MF.getFunction()->getContext()), 4); - bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES); + bool IsAllOnes = (Opc == X86::V_SETALLONES || Opc == X86::AVX_SETALLONES || + Opc == X86::AVX2_SETALLONES); const Constant *C = IsAllOnes ? Constant::getAllOnesValue(Ty) : Constant::getNullValue(Ty); unsigned CPI = MCP.getConstantPoolIndex(C, Alignment); @@ -3555,7 +3557,11 @@ static const unsigned ReplaceableInstrs[][3] = { { X86::VMOVAPSYrr, X86::VMOVAPDYrr, X86::VMOVDQAYrr }, { X86::VMOVUPSYmr, X86::VMOVUPDYmr, X86::VMOVDQUYmr }, { X86::VMOVUPSYrm, X86::VMOVUPDYrm, X86::VMOVDQUYrm }, - { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr }, + { X86::VMOVNTPSYmr, X86::VMOVNTPDYmr, X86::VMOVNTDQYmr } +}; + +static const unsigned ReplaceableInstrsAVX2[][3] = { + //PackedSingle PackedDouble PackedInt { X86::VANDNPSYrm, X86::VANDNPDYrm, X86::VPANDNYrm }, { X86::VANDNPSYrr, X86::VANDNPDYrr, X86::VPANDNYrr }, { X86::VANDPSYrm, X86::VANDPDYrm, X86::VPANDYrm }, @@ -3563,7 +3569,7 @@ static const unsigned ReplaceableInstrs[][3] = { { X86::VORPSYrm, X86::VORPDYrm, X86::VPORYrm }, { X86::VORPSYrr, X86::VORPDYrr, X86::VPORYrr }, { X86::VXORPSYrm, X86::VXORPDYrm, X86::VPXORYrm }, - { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr }, + { X86::VXORPSYrr, X86::VXORPDYrr, X86::VPXORYrr } }; // FIXME: Some shuffle and unpack instructions have equivalents in different @@ -3576,11 +3582,23 @@ static const unsigned *lookup(unsigned opcode, unsigned domain) { return 0; } +static const unsigned *lookupAVX2(unsigned opcode, unsigned domain) { + for (unsigned i = 0, e = array_lengthof(ReplaceableInstrsAVX2); i != e; ++i) + if (ReplaceableInstrsAVX2[i][domain-1] == opcode) + return ReplaceableInstrsAVX2[i]; + return 0; +} + std::pair<uint16_t, uint16_t> X86InstrInfo::getExecutionDomain(const MachineInstr *MI) const { uint16_t domain = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; - return std::make_pair(domain, - domain && lookup(MI->getOpcode(), domain) ? 0xe : 0); + bool hasAVX2 = TM.getSubtarget<X86Subtarget>().hasAVX2(); + uint16_t validDomains = 0; + if (domain && lookup(MI->getOpcode(), domain)) + validDomains = 0xe; + else if (domain && lookupAVX2(MI->getOpcode(), domain)) + validDomains = hasAVX2 ? 
0xe : 0x6; + return std::make_pair(domain, validDomains); } void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { @@ -3588,6 +3606,11 @@ void X86InstrInfo::setExecutionDomain(MachineInstr *MI, unsigned Domain) const { uint16_t dom = (MI->getDesc().TSFlags >> X86II::SSEDomainShift) & 3; assert(dom && "Not an SSE instruction"); const unsigned *table = lookup(MI->getOpcode(), dom); + if (!table) { // try the other table + assert((TM.getSubtarget<X86Subtarget>().hasAVX2() || Domain < 3) && + "256-bit vector operations only available in AVX2"); + table = lookupAVX2(MI->getOpcode(), dom); + } assert(table && "Cannot change domain"); MI->setDesc(get(table[Domain-1])); } diff --git a/lib/Target/X86/X86InstrInfo.td b/lib/Target/X86/X86InstrInfo.td index 79ce509..35631d5 100644 --- a/lib/Target/X86/X86InstrInfo.td +++ b/lib/Target/X86/X86InstrInfo.td @@ -1523,10 +1523,11 @@ def : MnemonicAlias<"call", "calll">, Requires<[In32BitMode]>; def : MnemonicAlias<"call", "callq">, Requires<[In64BitMode]>; def : MnemonicAlias<"cbw", "cbtw">; +def : MnemonicAlias<"cwde", "cwtl">; def : MnemonicAlias<"cwd", "cwtd">; def : MnemonicAlias<"cdq", "cltd">; -def : MnemonicAlias<"cwde", "cwtl">; def : MnemonicAlias<"cdqe", "cltq">; +def : MnemonicAlias<"cqo", "cqto">; // lret maps to lretl, it is not ambiguous with lretq. def : MnemonicAlias<"lret", "lretl">; diff --git a/lib/Target/X86/X86InstrSSE.td b/lib/Target/X86/X86InstrSSE.td index 6deee4f..7cadac1 100644 --- a/lib/Target/X86/X86InstrSSE.td +++ b/lib/Target/X86/X86InstrSSE.td @@ -311,13 +311,16 @@ def : Pat<(bc_v4i64 (v8f32 immAllZerosV)), // JIT implementation, it does not expand the instructions below like // X86MCInstLower does. let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, ExeDomain = SSEPackedInt in + isCodeGenOnly = 1, ExeDomain = SSEPackedInt in { def V_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>; -let isReMaterializable = 1, isAsCheapAsAMove = 1, canFoldAsLoad = 1, - isCodeGenOnly = 1, ExeDomain = SSEPackedInt, Predicates = [HasAVX] in + let Predicates = [HasAVX] in def AVX_SETALLONES : PDI<0x76, MRMInitReg, (outs VR128:$dst), (ins), "", [(set VR128:$dst, (v4i32 immAllOnesV))]>, VEX_4V; + let Predicates = [HasAVX2] in + def AVX2_SETALLONES : PDI<0x76, MRMInitReg, (outs VR256:$dst), (ins), "", + [(set VR256:$dst, (v8i32 immAllOnesV))]>, VEX_4V; +} //===----------------------------------------------------------------------===// @@ -522,6 +525,8 @@ let Predicates = [HasSSE2] in { // fold opportunity reappears. 
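Returning to getExecutionDomain above: validDomains is a bitmask indexed by the domain number (1 = PackedSingle, 2 = PackedDouble, 3 = PackedInt in this encoding). A hedged reading, with a hypothetical helper:

  // 0xe == 0b1110: all three domains are legal retargets.
  // 0x6 == 0b0110: without AVX2 the 256-bit PackedInt forms are withheld,
  // which is what the assert in setExecutionDomain enforces.
  static bool domainLegal(uint16_t ValidDomains, unsigned Domain) {
    return (ValidDomains & (1u << Domain)) != 0;
  }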
def : Pat<(v2f64 (X86Movlpd VR128:$src1, VR128:$src2)), (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2f64 VR128:$src2),sub_sd))>; + def : Pat<(v2i64 (X86Movlpd VR128:$src1, VR128:$src2)), + (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v2i64 VR128:$src2),sub_sd))>; def : Pat<(v4f32 (X86Movlps VR128:$src1, VR128:$src2)), (MOVSDrr VR128:$src1, (EXTRACT_SUBREG (v4f32 VR128:$src2),sub_sd))>; def : Pat<(v4i32 (X86Movlps VR128:$src1, VR128:$src2)), @@ -2467,21 +2472,21 @@ let Predicates = [HasAVX] in { def : Pat<(v4f32 (X86Unpckhps VR128:$src1, VR128:$src2)), (VUNPCKHPSrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, (memopv8f32 addr:$src2))), + def : Pat<(v8f32 (X86Unpcklps VR256:$src1, (memopv8f32 addr:$src2))), (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + def : Pat<(v8f32 (X86Unpcklps VR256:$src1, VR256:$src2)), (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, VR256:$src2)), + def : Pat<(v8i32 (X86Unpcklps VR256:$src1, VR256:$src2)), (VUNPCKLPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpcklpsy VR256:$src1, (memopv8i32 addr:$src2))), + def : Pat<(v8i32 (X86Unpcklps VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), (VUNPCKLPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, (memopv8f32 addr:$src2))), + def : Pat<(v8f32 (X86Unpckhps VR256:$src1, (memopv8f32 addr:$src2))), (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8f32 (X86Unpckhpsy VR256:$src1, VR256:$src2)), + def : Pat<(v8f32 (X86Unpckhps VR256:$src1, VR256:$src2)), (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, (memopv8i32 addr:$src2))), + def : Pat<(v8i32 (X86Unpckhps VR256:$src1, (bc_v8i32 (memopv4i64 addr:$src2)))), (VUNPCKHPSYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (X86Unpckhpsy VR256:$src1, VR256:$src2)), + def : Pat<(v8i32 (X86Unpckhps VR256:$src1, VR256:$src2)), (VUNPCKHPSYrr VR256:$src1, VR256:$src2)>; def : Pat<(v2f64 (X86Unpcklpd VR128:$src1, (memopv2f64 addr:$src2))), @@ -2493,21 +2498,21 @@ let Predicates = [HasAVX] in { def : Pat<(v2f64 (X86Unpckhpd VR128:$src1, VR128:$src2)), (VUNPCKHPDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, (memopv4f64 addr:$src2))), + def : Pat<(v4f64 (X86Unpcklpd VR256:$src1, (memopv4f64 addr:$src2))), (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + def : Pat<(v4f64 (X86Unpcklpd VR256:$src1, VR256:$src2)), (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, (memopv4i64 addr:$src2))), + def : Pat<(v4i64 (X86Unpcklpd VR256:$src1, (memopv4i64 addr:$src2))), (VUNPCKLPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpcklpdy VR256:$src1, VR256:$src2)), + def : Pat<(v4i64 (X86Unpcklpd VR256:$src1, VR256:$src2)), (VUNPCKLPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, (memopv4f64 addr:$src2))), + def : Pat<(v4f64 (X86Unpckhpd VR256:$src1, (memopv4f64 addr:$src2))), (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4f64 (X86Unpckhpdy VR256:$src1, VR256:$src2)), + def : Pat<(v4f64 (X86Unpckhpd VR256:$src1, VR256:$src2)), (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, (memopv4i64 addr:$src2))), + def : Pat<(v4i64 (X86Unpckhpd VR256:$src1, (memopv4i64 addr:$src2))), (VUNPCKHPDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (X86Unpckhpdy VR256:$src1, VR256:$src2)), + def : Pat<(v4i64 
(X86Unpckhpd VR256:$src1, VR256:$src2)), (VUNPCKHPDYrr VR256:$src1, VR256:$src2)>; // FIXME: Instead of X86Movddup, there should be a X86Unpcklpd here, the @@ -3421,47 +3426,6 @@ multiclass PDI_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, [(set RC:$dst, (OpVT (OpNode RC:$src1, (bitconvert (memop_frag addr:$src2)))))]>; } - -/// PDI_binop_rm_v2i64 - Simple SSE2 binary operator whose type is v2i64. -/// -/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew -/// to collapse (bitconvert VT to VT) into its operand. -/// -multiclass PDI_binop_rm_v2i64<bits<8> opc, string OpcodeStr, SDNode OpNode, - bit IsCommutable = 0, bit Is2Addr = 1> { - let isCommutable = IsCommutable in - def rr : PDI<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (v2i64 (OpNode VR128:$src1, VR128:$src2)))]>; - def rm : PDI<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !if(Is2Addr, - !strconcat(OpcodeStr, "\t{$src2, $dst|$dst, $src2}"), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}")), - [(set VR128:$dst, (OpNode VR128:$src1, (memopv2i64 addr:$src2)))]>; -} - -/// PDI_binop_rm_v4i64 - Simple AVX2 binary operator whose type is v4i64. -/// -/// FIXME: we could eliminate this and use PDI_binop_rm instead if tblgen knew -/// to collapse (bitconvert VT to VT) into its operand. -/// -multiclass PDI_binop_rm_v4i64<bits<8> opc, string OpcodeStr, SDNode OpNode, - bit IsCommutable = 0> { - let isCommutable = IsCommutable in - def rr : PDI<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (v4i64 (OpNode VR256:$src1, VR256:$src2)))]>; - def rm : PDI<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (OpNode VR256:$src1, (memopv4i64 addr:$src2)))]>; -} - } // ExeDomain = SSEPackedInt // 128-bit Integer Arithmetic @@ -3473,7 +3437,8 @@ defm VPADDW : PDI_binop_rm<0xFD, "vpaddw", add, v8i16, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; defm VPADDD : PDI_binop_rm<0xFE, "vpaddd", add, v4i32, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; -defm VPADDQ : PDI_binop_rm_v2i64<0xD4, "vpaddq", add, 1, 0>, VEX_4V; +defm VPADDQ : PDI_binop_rm<0xD4, "vpaddq", add, v2i64, VR128, memopv2i64, + i128mem, 1, 0>, VEX_4V; defm VPMULLW : PDI_binop_rm<0xD5, "vpmullw", mul, v8i16, VR128, memopv2i64, i128mem, 1, 0>, VEX_4V; defm VPSUBB : PDI_binop_rm<0xF8, "vpsubb", sub, v16i8, VR128, memopv2i64, @@ -3482,7 +3447,8 @@ defm VPSUBW : PDI_binop_rm<0xF9, "vpsubw", sub, v8i16, VR128, memopv2i64, i128mem, 0, 0>, VEX_4V; defm VPSUBD : PDI_binop_rm<0xFA, "vpsubd", sub, v4i32, VR128, memopv2i64, i128mem, 0, 0>, VEX_4V; -defm VPSUBQ : PDI_binop_rm_v2i64<0xFB, "vpsubq", sub, 0, 0>, VEX_4V; +defm VPSUBQ : PDI_binop_rm<0xFB, "vpsubq", sub, v2i64, VR128, memopv2i64, + i128mem, 0, 0>, VEX_4V; // Intrinsic forms defm VPSUBSB : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_sse2_psubs_b, @@ -3527,21 +3493,23 @@ defm VPSADBW : PDI_binop_rm_int<0xF6, "vpsadbw", int_x86_sse2_psad_bw, let Predicates = [HasAVX2] in { defm VPADDBY : PDI_binop_rm<0xFC, "vpaddb", add, v32i8, VR256, memopv4i64, - i256mem, 1, 0>, VEX_4V; + i256mem, 1, 0>, VEX_4V; defm VPADDWY : PDI_binop_rm<0xFD, "vpaddw", add, v16i16, VR256, memopv4i64, 
- i256mem, 1, 0>, VEX_4V; + i256mem, 1, 0>, VEX_4V; defm VPADDDY : PDI_binop_rm<0xFE, "vpaddd", add, v8i32, VR256, memopv4i64, - i256mem, 1, 0>, VEX_4V; -defm VPADDQY : PDI_binop_rm_v4i64<0xD4, "vpaddq", add, 1>, VEX_4V; + i256mem, 1, 0>, VEX_4V; +defm VPADDQY : PDI_binop_rm<0xD4, "vpaddq", add, v4i64, VR256, memopv4i64, + i256mem, 1, 0>, VEX_4V; defm VPMULLWY : PDI_binop_rm<0xD5, "vpmullw", mul, v16i16, VR256, memopv4i64, - i256mem, 1, 0>, VEX_4V; + i256mem, 1, 0>, VEX_4V; defm VPSUBBY : PDI_binop_rm<0xF8, "vpsubb", sub, v32i8, VR256, memopv4i64, - i256mem, 0, 0>, VEX_4V; + i256mem, 0, 0>, VEX_4V; defm VPSUBWY : PDI_binop_rm<0xF9, "vpsubw", sub, v16i16,VR256, memopv4i64, - i256mem, 0, 0>, VEX_4V; + i256mem, 0, 0>, VEX_4V; defm VPSUBDY : PDI_binop_rm<0xFA, "vpsubd", sub, v8i32, VR256, memopv4i64, - i256mem, 0, 0>, VEX_4V; -defm VPSUBQY : PDI_binop_rm_v4i64<0xFB, "vpsubq", sub, 0>, VEX_4V; + i256mem, 0, 0>, VEX_4V; +defm VPSUBQY : PDI_binop_rm<0xFB, "vpsubq", sub, v4i64, VR256, memopv4i64, + i256mem, 0, 0>, VEX_4V; // Intrinsic forms defm VPSUBSBY : PDI_binop_rm_int<0xE8, "vpsubsb" , int_x86_avx2_psubs_b, @@ -3591,7 +3559,8 @@ defm PADDW : PDI_binop_rm<0xFD, "paddw", add, v8i16, VR128, memopv2i64, i128mem, 1>; defm PADDD : PDI_binop_rm<0xFE, "paddd", add, v4i32, VR128, memopv2i64, i128mem, 1>; -defm PADDQ : PDI_binop_rm_v2i64<0xD4, "paddq", add, 1>; +defm PADDQ : PDI_binop_rm<0xD4, "paddq", add, v2i64, VR128, memopv2i64, + i128mem, 1>; defm PMULLW : PDI_binop_rm<0xD5, "pmullw", mul, v8i16, VR128, memopv2i64, i128mem, 1>; defm PSUBB : PDI_binop_rm<0xF8, "psubb", sub, v16i8, VR128, memopv2i64, @@ -3600,7 +3569,8 @@ defm PSUBW : PDI_binop_rm<0xF9, "psubw", sub, v8i16, VR128, memopv2i64, i128mem>; defm PSUBD : PDI_binop_rm<0xFA, "psubd", sub, v4i32, VR128, memopv2i64, i128mem>; -defm PSUBQ : PDI_binop_rm_v2i64<0xFB, "psubq", sub>; +defm PSUBQ : PDI_binop_rm<0xFB, "psubq", sub, v2i64, VR128, memopv2i64, + i128mem>; // Intrinsic forms defm PSUBSB : PDI_binop_rm_int<0xE8, "psubsb" , int_x86_sse2_psubs_b, @@ -3676,9 +3646,12 @@ defm VPSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad", int_x86_sse2_psra_d, int_x86_sse2_psrai_d, VR128, 0>, VEX_4V; -defm VPAND : PDI_binop_rm_v2i64<0xDB, "vpand", and, 1, 0>, VEX_4V; -defm VPOR : PDI_binop_rm_v2i64<0xEB, "vpor" , or, 1, 0>, VEX_4V; -defm VPXOR : PDI_binop_rm_v2i64<0xEF, "vpxor", xor, 1, 0>, VEX_4V; +defm VPAND : PDI_binop_rm<0xDB, "vpand", and, v2i64, VR128, memopv2i64, + i128mem, 1, 0>, VEX_4V; +defm VPOR : PDI_binop_rm<0xEB, "vpor" , or, v2i64, VR128, memopv2i64, + i128mem, 1, 0>, VEX_4V; +defm VPXOR : PDI_binop_rm<0xEF, "vpxor", xor, v2i64, VR128, memopv2i64, + i128mem, 1, 0>, VEX_4V; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3735,9 +3708,12 @@ defm VPSRADY : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "vpsrad", int_x86_avx2_psra_d, int_x86_avx2_psrai_d, VR256, 0>, VEX_4V; -defm VPANDY : PDI_binop_rm_v4i64<0xDB, "vpand", and, 1>, VEX_4V; -defm VPORY : PDI_binop_rm_v4i64<0xEB, "vpor" , or, 1>, VEX_4V; -defm VPXORY : PDI_binop_rm_v4i64<0xEF, "vpxor", xor, 1>, VEX_4V; +defm VPANDY : PDI_binop_rm<0xDB, "vpand", and, v4i64, VR256, memopv4i64, + i256mem, 1, 0>, VEX_4V; +defm VPORY : PDI_binop_rm<0xEB, "vpor", or, v4i64, VR256, memopv4i64, + i256mem, 1, 0>, VEX_4V; +defm VPXORY : PDI_binop_rm<0xEF, "vpxor", xor, v4i64, VR256, memopv4i64, + i256mem, 1, 0>, VEX_4V; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3794,9 +3770,12 @@ defm PSRAD : PDI_binop_rmi_int<0xE2, 0x72, MRM4r, "psrad", int_x86_sse2_psra_d, 
int_x86_sse2_psrai_d, VR128>; -defm PAND : PDI_binop_rm_v2i64<0xDB, "pand", and, 1>; -defm POR : PDI_binop_rm_v2i64<0xEB, "por" , or, 1>; -defm PXOR : PDI_binop_rm_v2i64<0xEF, "pxor", xor, 1>; +defm PAND : PDI_binop_rm<0xDB, "pand", and, v2i64, VR128, memopv2i64, + i128mem, 1>; +defm POR : PDI_binop_rm<0xEB, "por" , or, v2i64, VR128, memopv2i64, + i128mem, 1>; +defm PXOR : PDI_binop_rm<0xEF, "pxor", xor, v2i64, VR128, memopv2i64, + i128mem, 1>; let ExeDomain = SSEPackedInt in { let neverHasSideEffects = 1 in { @@ -3822,51 +3801,51 @@ let ExeDomain = SSEPackedInt in { let Predicates = [HasAVX] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (v2i64 (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (VPSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (v2i64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2), - (v2i64 (VPSLLDQri VR128:$src1, imm:$src2))>; + (VPSLLDQri VR128:$src1, imm:$src2)>; def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2), - (v2i64 (VPSRLDQri VR128:$src1, imm:$src2))>; + (VPSRLDQri VR128:$src1, imm:$src2)>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), - (v2f64 (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (VPSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; // Shift up / down and insert zero's. def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))), - (v2i64 (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + (VPSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))), - (v2i64 (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + (VPSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; } let Predicates = [HasAVX2] in { def : Pat<(int_x86_avx2_psll_dq VR256:$src1, imm:$src2), - (v4i64 (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2)))>; + (VPSLLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_avx2_psrl_dq VR256:$src1, imm:$src2), - (v4i64 (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2)))>; + (VPSRLDQYri VR256:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_avx2_psll_dq_bs VR256:$src1, imm:$src2), - (v4i64 (VPSLLDQYri VR256:$src1, imm:$src2))>; + (VPSLLDQYri VR256:$src1, imm:$src2)>; def : Pat<(int_x86_avx2_psrl_dq_bs VR256:$src1, imm:$src2), - (v4i64 (VPSRLDQYri VR256:$src1, imm:$src2))>; + (VPSRLDQYri VR256:$src1, imm:$src2)>; } let Predicates = [HasSSE2] in { def : Pat<(int_x86_sse2_psll_dq VR128:$src1, imm:$src2), - (v2i64 (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (PSLLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psrl_dq VR128:$src1, imm:$src2), - (v2i64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; def : Pat<(int_x86_sse2_psll_dq_bs VR128:$src1, imm:$src2), - (v2i64 (PSLLDQri VR128:$src1, imm:$src2))>; + (PSLLDQri VR128:$src1, imm:$src2)>; def : Pat<(int_x86_sse2_psrl_dq_bs VR128:$src1, imm:$src2), - (v2i64 (PSRLDQri VR128:$src1, imm:$src2))>; + (PSRLDQri VR128:$src1, imm:$src2)>; def : Pat<(v2f64 (X86fsrl VR128:$src1, i32immSExt8:$src2)), - (v2f64 (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2)))>; + (PSRLDQri VR128:$src1, (BYTE_imm imm:$src2))>; // Shift up / down and insert zero's. 
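A hedged note on the byte-shift patterns above: the psll_dq/psrl_dq intrinsic immediates appear to be expressed in bits, while the PSLLDQ/PSRLDQ instructions shift whole bytes, so BYTE_imm rescales the operand (the _bs variants, already byte-scaled, take imm directly). Conceptually, assuming that reading of BYTE_imm:

  static unsigned byteImm(unsigned BitShiftAmt) {
    return BitShiftAmt >> 3; // bits -> bytes for the DQ-register byte shifts
  }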
def : Pat<(v2i64 (X86vshl VR128:$src, (i8 imm:$amt))), - (v2i64 (PSLLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + (PSLLDQri VR128:$src, (BYTE_imm imm:$amt))>; def : Pat<(v2i64 (X86vshr VR128:$src, (i8 imm:$amt))), - (v2i64 (PSRLDQri VR128:$src, (BYTE_imm imm:$amt)))>; + (PSRLDQri VR128:$src, (BYTE_imm imm:$amt))>; } //===---------------------------------------------------------------------===// @@ -3889,28 +3868,34 @@ let Predicates = [HasAVX] in { def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), (VPCMPEQBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))), + def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)))), (VPCMPEQBrm VR128:$src1, addr:$src2)>; def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)), (VPCMPEQWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))), + def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, + (bc_v8i16 (memopv2i64 addr:$src2)))), (VPCMPEQWrm VR128:$src1, addr:$src2)>; def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)), (VPCMPEQDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))), + def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), (VPCMPEQDrm VR128:$src1, addr:$src2)>; def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)), (VPCMPGTBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))), + def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)))), (VPCMPGTBrm VR128:$src1, addr:$src2)>; def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)), (VPCMPGTWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))), + def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, + (bc_v8i16 (memopv2i64 addr:$src2)))), (VPCMPGTWrm VR128:$src1, addr:$src2)>; def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), (VPCMPGTDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), + def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), (VPCMPGTDrm VR128:$src1, addr:$src2)>; } @@ -3930,28 +3915,34 @@ let Predicates = [HasAVX2] in { def : Pat<(v32i8 (X86pcmpeqb VR256:$src1, VR256:$src2)), (VPCMPEQBYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v32i8 (X86pcmpeqb VR256:$src1, (memop addr:$src2))), + def : Pat<(v32i8 (X86pcmpeqb VR256:$src1, + (bc_v32i8 (memopv4i64 addr:$src2)))), (VPCMPEQBYrm VR256:$src1, addr:$src2)>; def : Pat<(v16i16 (X86pcmpeqw VR256:$src1, VR256:$src2)), (VPCMPEQWYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v16i16 (X86pcmpeqw VR256:$src1, (memop addr:$src2))), + def : Pat<(v16i16 (X86pcmpeqw VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)))), (VPCMPEQWYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86pcmpeqd VR256:$src1, VR256:$src2)), (VPCMPEQDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86pcmpeqd VR256:$src1, (memop addr:$src2))), + def : Pat<(v8i32 (X86pcmpeqd VR256:$src1, + (bc_v8i32 (memopv4i64 addr:$src2)))), (VPCMPEQDYrm VR256:$src1, addr:$src2)>; def : Pat<(v32i8 (X86pcmpgtb VR256:$src1, VR256:$src2)), (VPCMPGTBYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v32i8 (X86pcmpgtb VR256:$src1, (memop addr:$src2))), + def : Pat<(v32i8 (X86pcmpgtb VR256:$src1, + (bc_v32i8 (memopv4i64 addr:$src2)))), (VPCMPGTBYrm VR256:$src1, addr:$src2)>; def : Pat<(v16i16 (X86pcmpgtw VR256:$src1, VR256:$src2)), (VPCMPGTWYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v16i16 (X86pcmpgtw VR256:$src1, (memop addr:$src2))), + def : 
Pat<(v16i16 (X86pcmpgtw VR256:$src1, + (bc_v16i16 (memopv4i64 addr:$src2)))), (VPCMPGTWYrm VR256:$src1, addr:$src2)>; def : Pat<(v8i32 (X86pcmpgtd VR256:$src1, VR256:$src2)), (VPCMPGTDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (X86pcmpgtd VR256:$src1, (memop addr:$src2))), + def : Pat<(v8i32 (X86pcmpgtd VR256:$src1, + (bc_v8i32 (memopv4i64 addr:$src2)))), (VPCMPGTDYrm VR256:$src1, addr:$src2)>; } @@ -3973,28 +3964,34 @@ let Constraints = "$src1 = $dst" in { let Predicates = [HasSSE2] in { def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, VR128:$src2)), (PCMPEQBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, (memop addr:$src2))), + def : Pat<(v16i8 (X86pcmpeqb VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)))), (PCMPEQBrm VR128:$src1, addr:$src2)>; def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, VR128:$src2)), (PCMPEQWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, (memop addr:$src2))), + def : Pat<(v8i16 (X86pcmpeqw VR128:$src1, + (bc_v8i16 (memopv2i64 addr:$src2)))), (PCMPEQWrm VR128:$src1, addr:$src2)>; def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, VR128:$src2)), (PCMPEQDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, (memop addr:$src2))), + def : Pat<(v4i32 (X86pcmpeqd VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), (PCMPEQDrm VR128:$src1, addr:$src2)>; def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, VR128:$src2)), (PCMPGTBrr VR128:$src1, VR128:$src2)>; - def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, (memop addr:$src2))), + def : Pat<(v16i8 (X86pcmpgtb VR128:$src1, + (bc_v16i8 (memopv2i64 addr:$src2)))), (PCMPGTBrm VR128:$src1, addr:$src2)>; def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, VR128:$src2)), (PCMPGTWrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, (memop addr:$src2))), + def : Pat<(v8i16 (X86pcmpgtw VR128:$src1, + (bc_v8i16 (memopv2i64 addr:$src2)))), (PCMPGTWrm VR128:$src1, addr:$src2)>; def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, VR128:$src2)), (PCMPGTDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, (memop addr:$src2))), + def : Pat<(v4i32 (X86pcmpgtd VR128:$src1, + (bc_v4i32 (memopv2i64 addr:$src2)))), (PCMPGTDrm VR128:$src1, addr:$src2)>; } @@ -4207,19 +4204,8 @@ let Predicates = [HasAVX] in { bc_v8i16, 0>, VEX_4V; defm VPUNPCKLDQ : sse2_unpack<0x62, "vpunpckldq", v4i32, X86Punpckldq, bc_v4i32, 0>, VEX_4V; - - /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. - def VPUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1, - VR128:$src2)))]>, VEX_4V; - def VPUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpcklqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack<0x6C, "vpunpcklqdq", v2i64, X86Punpcklqdq, + bc_v2i64, 0>, VEX_4V; defm VPUNPCKHBW : sse2_unpack<0x68, "vpunpckhbw", v16i8, X86Punpckhbw, bc_v16i8, 0>, VEX_4V; @@ -4227,19 +4213,8 @@ let Predicates = [HasAVX] in { bc_v8i16, 0>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack<0x6A, "vpunpckhdq", v4i32, X86Punpckhdq, bc_v4i32, 0>, VEX_4V; - - /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. 
- def VPUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1, - VR128:$src2)))]>, VEX_4V; - def VPUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR128:$dst, (v2i64 (X86Punpckhqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>, VEX_4V; + defm VPUNPCKHQDQ : sse2_unpack<0x6D, "vpunpckhqdq", v2i64, X86Punpckhqdq, + bc_v2i64, 0>, VEX_4V; } let Predicates = [HasAVX2] in { @@ -4249,19 +4224,8 @@ let Predicates = [HasAVX2] in { bc_v16i16>, VEX_4V; defm VPUNPCKLDQ : sse2_unpack_y<0x62, "vpunpckldq", v8i32, X86Punpckldq, bc_v8i32>, VEX_4V; - - /// FIXME: we could eliminate this and use sse2_unpack_y instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. - def VPUNPCKLQDQYrr : PDI<0x6C, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), - "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, (v4i64 (X86Punpcklqdq VR256:$src1, - VR256:$src2)))]>, VEX_4V; - def VPUNPCKLQDQYrm : PDI<0x6C, MRMSrcMem, - (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), - "vpunpcklqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, (v4i64 (X86Punpcklqdq VR256:$src1, - (memopv4i64 addr:$src2))))]>, VEX_4V; + defm VPUNPCKLQDQ : sse2_unpack_y<0x6C, "vpunpcklqdq", v4i64, X86Punpcklqdq, + bc_v4i64>, VEX_4V; defm VPUNPCKHBW : sse2_unpack_y<0x68, "vpunpckhbw", v32i8, X86Punpckhbw, bc_v32i8>, VEX_4V; @@ -4269,57 +4233,28 @@ let Predicates = [HasAVX2] in { bc_v16i16>, VEX_4V; defm VPUNPCKHDQ : sse2_unpack_y<0x6A, "vpunpckhdq", v8i32, X86Punpckhdq, bc_v8i32>, VEX_4V; - - /// FIXME: we could eliminate this and use sse2_unpack_y instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. - def VPUNPCKHQDQYrr : PDI<0x6D, MRMSrcReg, - (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), - "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, (v4i64 (X86Punpckhqdq VR256:$src1, - VR256:$src2)))]>, VEX_4V; - def VPUNPCKHQDQYrm : PDI<0x6D, MRMSrcMem, - (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), - "vpunpckhqdq\t{$src2, $src1, $dst|$dst, $src1, $src2}", - [(set VR256:$dst, (v4i64 (X86Punpckhqdq VR256:$src1, - (memopv4i64 addr:$src2))))]>, VEX_4V; + defm VPUNPCKHQDQ : sse2_unpack_y<0x6D, "vpunpckhqdq", v4i64, X86Punpckhqdq, + bc_v4i64>, VEX_4V; } let Constraints = "$src1 = $dst" in { - defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, bc_v16i8>; - defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, bc_v8i16>; - defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, bc_v4i32>; - - /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. 
- def PUNPCKLQDQrr : PDI<0x6C, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpcklqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpcklqdq VR128:$src1, VR128:$src2)))]>; - def PUNPCKLQDQrm : PDI<0x6C, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpcklqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpcklqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>; - - defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, bc_v16i8>; - defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, bc_v8i16>; - defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, bc_v4i32>; - - /// FIXME: we could eliminate this and use sse2_unpack instead if tblgen - /// knew to collapse (bitconvert VT to VT) into its operand. - def PUNPCKHQDQrr : PDI<0x6D, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), - "punpckhqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpckhqdq VR128:$src1, VR128:$src2)))]>; - def PUNPCKHQDQrm : PDI<0x6D, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), - "punpckhqdq\t{$src2, $dst|$dst, $src2}", - [(set VR128:$dst, - (v2i64 (X86Punpckhqdq VR128:$src1, - (memopv2i64 addr:$src2))))]>; + defm PUNPCKLBW : sse2_unpack<0x60, "punpcklbw", v16i8, X86Punpcklbw, + bc_v16i8>; + defm PUNPCKLWD : sse2_unpack<0x61, "punpcklwd", v8i16, X86Punpcklwd, + bc_v8i16>; + defm PUNPCKLDQ : sse2_unpack<0x62, "punpckldq", v4i32, X86Punpckldq, + bc_v4i32>; + defm PUNPCKLQDQ : sse2_unpack<0x6C, "punpcklqdq", v2i64, X86Punpcklqdq, + bc_v2i64>; + + defm PUNPCKHBW : sse2_unpack<0x68, "punpckhbw", v16i8, X86Punpckhbw, + bc_v16i8>; + defm PUNPCKHWD : sse2_unpack<0x69, "punpckhwd", v8i16, X86Punpckhwd, + bc_v8i16>; + defm PUNPCKHDQ : sse2_unpack<0x6A, "punpckhdq", v4i32, X86Punpckhdq, + bc_v4i32>; + defm PUNPCKHQDQ : sse2_unpack<0x6D, "punpckhqdq", v2i64, X86Punpckhqdq, + bc_v2i64>; } } // ExeDomain = SSEPackedInt @@ -5052,21 +4987,25 @@ multiclass sse3_addsub<Intrinsic Int, string OpcodeStr, RegisterClass RC, [(set RC:$dst, (Int RC:$src1, (memop addr:$src2)))]>; } -let Predicates = [HasAVX], - ExeDomain = SSEPackedDouble in { - defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, - f128mem, 0>, TB, XD, VEX_4V; - defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, - f128mem, 0>, TB, OpSize, VEX_4V; - defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, - f256mem, 0>, TB, XD, VEX_4V; - defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, - f256mem, 0>, TB, OpSize, VEX_4V; -} -let Constraints = "$src1 = $dst", Predicates = [HasSSE3], - ExeDomain = SSEPackedDouble in { +let Predicates = [HasAVX] in { + let ExeDomain = SSEPackedSingle in { + defm VADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "vaddsubps", VR128, + f128mem, 0>, TB, XD, VEX_4V; + defm VADDSUBPSY : sse3_addsub<int_x86_avx_addsub_ps_256, "vaddsubps", VR256, + f256mem, 0>, TB, XD, VEX_4V; + } + let ExeDomain = SSEPackedDouble in { + defm VADDSUBPD : sse3_addsub<int_x86_sse3_addsub_pd, "vaddsubpd", VR128, + f128mem, 0>, TB, OpSize, VEX_4V; + defm VADDSUBPDY : sse3_addsub<int_x86_avx_addsub_pd_256, "vaddsubpd", VR256, + f256mem, 0>, TB, OpSize, VEX_4V; + } +} +let Constraints = "$src1 = $dst", Predicates = [HasSSE3] in { + let ExeDomain = SSEPackedSingle in defm ADDSUBPS : sse3_addsub<int_x86_sse3_addsub_ps, "addsubps", VR128, f128mem>, TB, XD; + let ExeDomain = SSEPackedDouble in defm ADDSUBPD : 
sse3_addsub<int_x86_sse3_addsub_pd, "addsubpd", VR128, f128mem>, TB, OpSize; } @@ -5106,29 +5045,37 @@ multiclass S3_Int<bits<8> o, string OpcodeStr, ValueType vt, RegisterClass RC, } let Predicates = [HasAVX] in { - defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, - X86fhadd, 0>, VEX_4V; - defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, - X86fhadd, 0>, VEX_4V; - defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, - X86fhsub, 0>, VEX_4V; - defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, - X86fhsub, 0>, VEX_4V; - defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, - X86fhadd, 0>, VEX_4V; - defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, - X86fhadd, 0>, VEX_4V; - defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, - X86fhsub, 0>, VEX_4V; - defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, - X86fhsub, 0>, VEX_4V; + let ExeDomain = SSEPackedSingle in { + defm VHADDPS : S3D_Int<0x7C, "vhaddps", v4f32, VR128, f128mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPS : S3D_Int<0x7D, "vhsubps", v4f32, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHADDPSY : S3D_Int<0x7C, "vhaddps", v8f32, VR256, f256mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPSY : S3D_Int<0x7D, "vhsubps", v8f32, VR256, f256mem, + X86fhsub, 0>, VEX_4V; + } + let ExeDomain = SSEPackedDouble in { + defm VHADDPD : S3_Int <0x7C, "vhaddpd", v2f64, VR128, f128mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPD : S3_Int <0x7D, "vhsubpd", v2f64, VR128, f128mem, + X86fhsub, 0>, VEX_4V; + defm VHADDPDY : S3_Int <0x7C, "vhaddpd", v4f64, VR256, f256mem, + X86fhadd, 0>, VEX_4V; + defm VHSUBPDY : S3_Int <0x7D, "vhsubpd", v4f64, VR256, f256mem, + X86fhsub, 0>, VEX_4V; + } } let Constraints = "$src1 = $dst" in { - defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; - defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; - defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; - defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; + let ExeDomain = SSEPackedSingle in { + defm HADDPS : S3D_Int<0x7C, "haddps", v4f32, VR128, f128mem, X86fhadd>; + defm HSUBPS : S3D_Int<0x7D, "hsubps", v4f32, VR128, f128mem, X86fhsub>; + } + let ExeDomain = SSEPackedDouble in { + defm HADDPD : S3_Int<0x7C, "haddpd", v2f64, VR128, f128mem, X86fhadd>; + defm HSUBPD : S3_Int<0x7D, "hsubpd", v2f64, VR128, f128mem, X86fhsub>; + } } //===---------------------------------------------------------------------===// @@ -5284,11 +5231,11 @@ let isCommutable = 0 in { int_x86_avx2_pmadd_ub_sw>, VEX_4V; defm VPSHUFB : SS3I_binop_rm_int_y<0x00, "vpshufb", memopv32i8, int_x86_avx2_pshuf_b>, VEX_4V; - defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", memopv16i8, + defm VPSIGNB : SS3I_binop_rm_int_y<0x08, "vpsignb", memopv32i8, int_x86_avx2_psign_b>, VEX_4V; - defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", memopv8i16, + defm VPSIGNW : SS3I_binop_rm_int_y<0x09, "vpsignw", memopv16i16, int_x86_avx2_psign_w>, VEX_4V; - defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", memopv4i32, + defm VPSIGND : SS3I_binop_rm_int_y<0x0A, "vpsignd", memopv8i32, int_x86_avx2_psign_d>, VEX_4V; } defm VPMULHRSW : SS3I_binop_rm_int_y<0x0B, "vpmulhrsw", memopv16i16, @@ -5331,12 +5278,21 @@ let Predicates = [HasSSSE3] in { def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), (PSHUFBrm128 VR128:$src, addr:$mask)>; - def : Pat<(X86psignb VR128:$src1, VR128:$src2), + def : Pat<(v16i8 (X86psign VR128:$src1, VR128:$src2)), 
(PSIGNBrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignw VR128:$src1, VR128:$src2), + def : Pat<(v8i16 (X86psign VR128:$src1, VR128:$src2)), (PSIGNWrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignd VR128:$src1, VR128:$src2), + def : Pat<(v4i32 (X86psign VR128:$src1, VR128:$src2)), (PSIGNDrr128 VR128:$src1, VR128:$src2)>; + + def : Pat<(v8i16 (X86hadd VR128:$src1, VR128:$src2)), + (PHADDWrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86hadd VR128:$src1, VR128:$src2)), + (PHADDDrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86hsub VR128:$src1, VR128:$src2)), + (PHSUBWrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86hsub VR128:$src1, VR128:$src2)), + (PHSUBDrr128 VR128:$src1, VR128:$src2)>; } let Predicates = [HasAVX] in { @@ -5345,12 +5301,39 @@ let Predicates = [HasAVX] in { def : Pat<(X86pshufb VR128:$src, (bc_v16i8 (memopv2i64 addr:$mask))), (VPSHUFBrm128 VR128:$src, addr:$mask)>; - def : Pat<(X86psignb VR128:$src1, VR128:$src2), + def : Pat<(v16i8 (X86psign VR128:$src1, VR128:$src2)), (VPSIGNBrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignw VR128:$src1, VR128:$src2), + def : Pat<(v8i16 (X86psign VR128:$src1, VR128:$src2)), (VPSIGNWrr128 VR128:$src1, VR128:$src2)>; - def : Pat<(X86psignd VR128:$src1, VR128:$src2), + def : Pat<(v4i32 (X86psign VR128:$src1, VR128:$src2)), (VPSIGNDrr128 VR128:$src1, VR128:$src2)>; + + def : Pat<(v8i16 (X86hadd VR128:$src1, VR128:$src2)), + (VPHADDWrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86hadd VR128:$src1, VR128:$src2)), + (VPHADDDrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(v8i16 (X86hsub VR128:$src1, VR128:$src2)), + (VPHSUBWrr128 VR128:$src1, VR128:$src2)>; + def : Pat<(v4i32 (X86hsub VR128:$src1, VR128:$src2)), + (VPHSUBDrr128 VR128:$src1, VR128:$src2)>; +} + +let Predicates = [HasAVX2] in { + def : Pat<(v32i8 (X86psign VR256:$src1, VR256:$src2)), + (VPSIGNBrr256 VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (X86psign VR256:$src1, VR256:$src2)), + (VPSIGNWrr256 VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86psign VR256:$src1, VR256:$src2)), + (VPSIGNDrr256 VR256:$src1, VR256:$src2)>; + + def : Pat<(v16i16 (X86hadd VR256:$src1, VR256:$src2)), + (VPHADDWrr256 VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86hadd VR256:$src1, VR256:$src2)), + (VPHADDDrr256 VR256:$src1, VR256:$src2)>; + def : Pat<(v16i16 (X86hsub VR256:$src1, VR256:$src2)), + (VPHSUBWrr256 VR256:$src1, VR256:$src2)>; + def : Pat<(v8i32 (X86hsub VR256:$src1, VR256:$src2)), + (VPHSUBDrr256 VR256:$src1, VR256:$src2)>; } //===---------------------------------------------------------------------===// @@ -5837,14 +5820,16 @@ multiclass SS41I_extractf32<bits<8> opc, string OpcodeStr> { addr:$dst)]>, OpSize; } -let Predicates = [HasAVX] in { - defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; - def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), - (ins VR128:$src1, i32i8imm:$src2), - "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", - []>, OpSize, VEX; +let ExeDomain = SSEPackedSingle in { + let Predicates = [HasAVX] in { + defm VEXTRACTPS : SS41I_extractf32<0x17, "vextractps">, VEX; + def VEXTRACTPSrr64 : SS4AIi8<0x17, MRMDestReg, (outs GR64:$dst), + (ins VR128:$src1, i32i8imm:$src2), + "vextractps \t{$src2, $src1, $dst|$dst, $src1, $src2}", + []>, OpSize, VEX; + } + defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; } -defm EXTRACTPS : SS41I_extractf32<0x17, "extractps">; // Also match an EXTRACTPS store when the store is done as f32 instead of i32. 
def : Pat<(store (f32 (bitconvert (extractelt (bc_v4i32 (v4f32 VR128:$src1)), @@ -5965,10 +5950,12 @@ multiclass SS41I_insertf32<bits<8> opc, string asm, bit Is2Addr = 1> { imm:$src3))]>, OpSize; } -let Constraints = "$src1 = $dst" in - defm INSERTPS : SS41I_insertf32<0x21, "insertps">; -let Predicates = [HasAVX] in - defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; +let ExeDomain = SSEPackedSingle in { + let Constraints = "$src1 = $dst" in + defm INSERTPS : SS41I_insertf32<0x21, "insertps">; + let Predicates = [HasAVX] in + defm VINSERTPS : SS41I_insertf32<0x21, "vinsertps", 0>, VEX_4V; +} def : Pat<(int_x86_sse41_insertps VR128:$src1, VR128:$src2, imm:$src3), (VINSERTPSrr VR128:$src1, VR128:$src2, imm:$src3)>, @@ -5985,6 +5972,7 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, X86MemOperand x86memop, RegisterClass RC, PatFrag mem_frag32, PatFrag mem_frag64, Intrinsic V4F32Int, Intrinsic V2F64Int> { +let ExeDomain = SSEPackedSingle in { // Intrinsic operation, reg. // Vector intrinsic operation, reg def PSr : SS4AIi8<opcps, MRMSrcReg, @@ -5995,15 +5983,16 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, OpSize; // Vector intrinsic operation, mem - def PSm : Ii8<opcps, MRMSrcMem, + def PSm : SS4AIi8<opcps, MRMSrcMem, (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), !strconcat(OpcodeStr, "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set RC:$dst, (V4F32Int (mem_frag32 addr:$src1),imm:$src2))]>, - TA, OpSize, - Requires<[HasSSE41]>; + OpSize; +} // ExeDomain = SSEPackedSingle +let ExeDomain = SSEPackedDouble in { // Vector intrinsic operation, reg def PDr : SS4AIi8<opcpd, MRMSrcReg, (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), @@ -6020,44 +6009,14 @@ multiclass sse41_fp_unop_rm<bits<8> opcps, bits<8> opcpd, string OpcodeStr, [(set RC:$dst, (V2F64Int (mem_frag64 addr:$src1),imm:$src2))]>, OpSize; -} - -multiclass sse41_fp_unop_rm_avx_p<bits<8> opcps, bits<8> opcpd, - RegisterClass RC, X86MemOperand x86memop, string OpcodeStr> { - // Intrinsic operation, reg. - // Vector intrinsic operation, reg - def PSr_AVX : SS4AIi8<opcps, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; - - // Vector intrinsic operation, mem - def PSm_AVX : Ii8<opcps, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "ps\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, TA, OpSize, Requires<[HasSSE41]>; - - // Vector intrinsic operation, reg - def PDr_AVX : SS4AIi8<opcpd, MRMSrcReg, - (outs RC:$dst), (ins RC:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; - - // Vector intrinsic operation, mem - def PDm_AVX : SS4AIi8<opcpd, MRMSrcMem, - (outs RC:$dst), (ins x86memop:$src1, i32i8imm:$src2), - !strconcat(OpcodeStr, - "pd\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - []>, OpSize; +} // ExeDomain = SSEPackedDouble } multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, string OpcodeStr, Intrinsic F32Int, Intrinsic F64Int, bit Is2Addr = 1> { +let ExeDomain = GenericDomain in { // Intrinsic operation, reg. 
def SSr : SS4AIi8<opcss, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), @@ -6103,37 +6062,7 @@ multiclass sse41_fp_binop_rm<bits<8> opcss, bits<8> opcsd, [(set VR128:$dst, (F64Int VR128:$src1, sse_load_f64:$src2, imm:$src3))]>, OpSize; -} - -multiclass sse41_fp_binop_rm_avx_s<bits<8> opcss, bits<8> opcsd, - string OpcodeStr> { - // Intrinsic operation, reg. - def SSr_AVX : SS4AIi8<opcss, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; - - // Intrinsic operation, mem. - def SSm_AVX : SS4AIi8<opcss, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, ssmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "ss\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; - - // Intrinsic operation, reg. - def SDr_AVX : SS4AIi8<opcsd, MRMSrcReg, - (outs VR128:$dst), (ins VR128:$src1, VR128:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; - - // Intrinsic operation, mem. - def SDm_AVX : SS4AIi8<opcsd, MRMSrcMem, - (outs VR128:$dst), (ins VR128:$src1, sdmem:$src2, i32i8imm:$src3), - !strconcat(OpcodeStr, - "sd\t{$src3, $src2, $src1, $dst|$dst, $src1, $src2, $src3}"), - []>, OpSize; +} // ExeDomain = GenericDomain } // FP round - roundss, roundps, roundsd, roundpd @@ -6150,13 +6079,6 @@ let Predicates = [HasAVX] in { defm VROUND : sse41_fp_binop_rm<0x0A, 0x0B, "vround", int_x86_sse41_round_ss, int_x86_sse41_round_sd, 0>, VEX_4V, VEX_LIG; - - // Instructions for the assembler - defm VROUND : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR128, f128mem, "vround">, - VEX; - defm VROUNDY : sse41_fp_unop_rm_avx_p<0x08, 0x09, VR256, f256mem, "vround">, - VEX; - defm VROUND : sse41_fp_binop_rm_avx_s<0x0A, 0x0B, "vround">, VEX_4V, VEX_LIG; } defm ROUND : sse41_fp_unop_rm<0x08, 0x09, "round", f128mem, VR128, @@ -6194,11 +6116,11 @@ def VPTESTYrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR256:$src1, i256mem:$src2), let Defs = [EFLAGS] in { def PTESTrr : SS48I<0x17, MRMSrcReg, (outs), (ins VR128:$src1, VR128:$src2), - "ptest \t{$src2, $src1|$src1, $src2}", + "ptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (v4f32 VR128:$src2)))]>, OpSize; def PTESTrm : SS48I<0x17, MRMSrcMem, (outs), (ins VR128:$src1, f128mem:$src2), - "ptest \t{$src2, $src1|$src1, $src2}", + "ptest\t{$src2, $src1|$src1, $src2}", [(set EFLAGS, (X86ptest VR128:$src1, (memopv4f32 addr:$src2)))]>, OpSize; } @@ -6216,11 +6138,15 @@ multiclass avx_bittest<bits<8> opc, string OpcodeStr, RegisterClass RC, } let Defs = [EFLAGS], Predicates = [HasAVX] in { +let ExeDomain = SSEPackedSingle in { defm VTESTPS : avx_bittest<0x0E, "vtestps", VR128, f128mem, memopv4f32, v4f32>; defm VTESTPSY : avx_bittest<0x0E, "vtestps", VR256, f256mem, memopv8f32, v8f32>; +} +let ExeDomain = SSEPackedDouble in { defm VTESTPD : avx_bittest<0x0F, "vtestpd", VR128, f128mem, memopv2f64, v2f64>; defm VTESTPDY : avx_bittest<0x0F, "vtestpd", VR256, f256mem, memopv4f64, v4f64>; } +} //===----------------------------------------------------------------------===// // SSE4.1 - Misc Instructions @@ -6391,10 +6317,12 @@ let Constraints = "$src1 = $dst" in { defm PMULDQ : SS41I_binop_rm_int<0x28, "pmuldq", int_x86_sse41_pmuldq>; } -def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), - (PCMPEQQrr VR128:$src1, VR128:$src2)>; -def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), - (PCMPEQQrm VR128:$src1, 
addr:$src2)>; +let Predicates = [HasSSE41] in { + def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, VR128:$src2)), + (PCMPEQQrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86pcmpeqq VR128:$src1, (memop addr:$src2))), + (PCMPEQQrm VR128:$src1, addr:$src2)>; +} /// SS48I_binop_rm - Simple SSE41 binary operator. multiclass SS48I_binop_rm<bits<8> opc, string OpcodeStr, SDNode OpNode, @@ -6470,23 +6398,30 @@ multiclass SS41I_binop_rmi_int<bits<8> opc, string OpcodeStr, let Predicates = [HasAVX] in { let isCommutable = 0 in { - defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, - VR128, memopv16i8, i128mem, 0>, VEX_4V; - defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, - VR128, memopv16i8, i128mem, 0>, VEX_4V; - defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", - int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; - defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", - int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + let ExeDomain = SSEPackedSingle in { + defm VBLENDPS : SS41I_binop_rmi_int<0x0C, "vblendps", int_x86_sse41_blendps, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VBLENDPSY : SS41I_binop_rmi_int<0x0C, "vblendps", + int_x86_avx_blend_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + } + let ExeDomain = SSEPackedDouble in { + defm VBLENDPD : SS41I_binop_rmi_int<0x0D, "vblendpd", int_x86_sse41_blendpd, + VR128, memopv16i8, i128mem, 0>, VEX_4V; + defm VBLENDPDY : SS41I_binop_rmi_int<0x0D, "vblendpd", + int_x86_avx_blend_pd_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; + } defm VPBLENDW : SS41I_binop_rmi_int<0x0E, "vpblendw", int_x86_sse41_pblendw, VR128, memopv16i8, i128mem, 0>, VEX_4V; defm VMPSADBW : SS41I_binop_rmi_int<0x42, "vmpsadbw", int_x86_sse41_mpsadbw, VR128, memopv16i8, i128mem, 0>, VEX_4V; } + let ExeDomain = SSEPackedSingle in defm VDPPS : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_sse41_dpps, VR128, memopv16i8, i128mem, 0>, VEX_4V; + let ExeDomain = SSEPackedDouble in defm VDPPD : SS41I_binop_rmi_int<0x41, "vdppd", int_x86_sse41_dppd, VR128, memopv16i8, i128mem, 0>, VEX_4V; + let ExeDomain = SSEPackedSingle in defm VDPPSY : SS41I_binop_rmi_int<0x40, "vdpps", int_x86_avx_dp_ps_256, VR256, memopv32i8, i256mem, 0>, VEX_4V; } @@ -6502,8 +6437,10 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in { let isCommutable = 0 in { + let ExeDomain = SSEPackedSingle in defm BLENDPS : SS41I_binop_rmi_int<0x0C, "blendps", int_x86_sse41_blendps, VR128, memopv16i8, i128mem>; + let ExeDomain = SSEPackedDouble in defm BLENDPD : SS41I_binop_rmi_int<0x0D, "blendpd", int_x86_sse41_blendpd, VR128, memopv16i8, i128mem>; defm PBLENDW : SS41I_binop_rmi_int<0x0E, "pblendw", int_x86_sse41_pblendw, @@ -6511,8 +6448,10 @@ let Constraints = "$src1 = $dst" in { defm MPSADBW : SS41I_binop_rmi_int<0x42, "mpsadbw", int_x86_sse41_mpsadbw, VR128, memopv16i8, i128mem>; } + let ExeDomain = SSEPackedSingle in defm DPPS : SS41I_binop_rmi_int<0x40, "dpps", int_x86_sse41_dpps, VR128, memopv16i8, i128mem>; + let ExeDomain = SSEPackedDouble in defm DPPD : SS41I_binop_rmi_int<0x41, "dppd", int_x86_sse41_dppd, VR128, memopv16i8, i128mem>; } @@ -6539,16 +6478,20 @@ multiclass SS41I_quaternary_int_avx<bits<8> opc, string OpcodeStr, } let Predicates = [HasAVX] in { +let ExeDomain = SSEPackedDouble in { defm VBLENDVPD : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR128, i128mem, memopv16i8, int_x86_sse41_blendvpd>; -defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, - memopv16i8, 
int_x86_sse41_blendvps>; -defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, - memopv16i8, int_x86_sse41_pblendvb>; defm VBLENDVPDY : SS41I_quaternary_int_avx<0x4B, "vblendvpd", VR256, i256mem, memopv32i8, int_x86_avx_blendv_pd_256>; +} // ExeDomain = SSEPackedDouble +let ExeDomain = SSEPackedSingle in { +defm VBLENDVPS : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR128, i128mem, + memopv16i8, int_x86_sse41_blendvps>; defm VBLENDVPSY : SS41I_quaternary_int_avx<0x4A, "vblendvps", VR256, i256mem, memopv32i8, int_x86_avx_blendv_ps_256>; +} // ExeDomain = SSEPackedSingle +defm VPBLENDVB : SS41I_quaternary_int_avx<0x4C, "vpblendvb", VR128, i128mem, + memopv16i8, int_x86_sse41_pblendvb>; } let Predicates = [HasAVX2] in { @@ -6612,7 +6555,9 @@ let Uses = [XMM0], Constraints = "$src1 = $dst" in { } } +let ExeDomain = SSEPackedDouble in defm BLENDVPD : SS41I_ternary_int<0x15, "blendvpd", int_x86_sse41_blendvpd>; +let ExeDomain = SSEPackedSingle in defm BLENDVPS : SS41I_ternary_int<0x14, "blendvps", int_x86_sse41_blendvps>; defm PBLENDVB : SS41I_ternary_int<0x10, "pblendvb", int_x86_sse41_pblendvb>; @@ -6712,10 +6657,12 @@ let Predicates = [HasAVX2] in { let Constraints = "$src1 = $dst" in defm PCMPGTQ : SS42I_binop_rm_int<0x37, "pcmpgtq", int_x86_sse42_pcmpgtq>; -def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), - (PCMPGTQrr VR128:$src1, VR128:$src2)>; -def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), - (PCMPGTQrm VR128:$src1, addr:$src2)>; +let Predicates = [HasSSE42] in { + def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, VR128:$src2)), + (PCMPGTQrr VR128:$src1, VR128:$src2)>; + def : Pat<(v2i64 (X86pcmpgtq VR128:$src1, (memop addr:$src2))), + (PCMPGTQrm VR128:$src1, addr:$src2)>; +} //===----------------------------------------------------------------------===// // SSE4.2 - String/text Processing Instructions @@ -7164,21 +7111,27 @@ class avx2_broadcast_reg<bits<8> opc, string OpcodeStr, RegisterClass RC, !strconcat(OpcodeStr, "\t{$src, $dst|$dst, $src}"), [(set RC:$dst, (Int VR128:$src))]>, VEX; -def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, - int_x86_avx_vbroadcast_ss>; -def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, - int_x86_avx_vbroadcast_ss_256>; +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrm : avx_broadcast<0x18, "vbroadcastss", VR128, f32mem, + int_x86_avx_vbroadcast_ss>; + def VBROADCASTSSYrm : avx_broadcast<0x18, "vbroadcastss", VR256, f32mem, + int_x86_avx_vbroadcast_ss_256>; +} +let ExeDomain = SSEPackedDouble in def VBROADCASTSDrm : avx_broadcast<0x19, "vbroadcastsd", VR256, f64mem, int_x86_avx_vbroadcast_sd_256>; def VBROADCASTF128 : avx_broadcast<0x1A, "vbroadcastf128", VR256, f128mem, int_x86_avx_vbroadcastf128_pd_256>; -def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, - int_x86_avx2_vbroadcast_ss_ps>; -def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, - int_x86_avx2_vbroadcast_ss_ps_256>; +let ExeDomain = SSEPackedSingle in { + def VBROADCASTSSrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR128, + int_x86_avx2_vbroadcast_ss_ps>; + def VBROADCASTSSYrr : avx2_broadcast_reg<0x18, "vbroadcastss", VR256, + int_x86_avx2_vbroadcast_ss_ps_256>; +} +let ExeDomain = SSEPackedDouble in def VBROADCASTSDrr : avx2_broadcast_reg<0x19, "vbroadcastsd", VR256, - int_x86_avx2_vbroadcast_sd_pd_256>; + int_x86_avx2_vbroadcast_sd_pd_256>; let Predicates = [HasAVX2] in def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, @@ -7187,19 
+7140,6 @@ def VBROADCASTI128 : avx_broadcast<0x5A, "vbroadcasti128", VR256, i128mem, def : Pat<(int_x86_avx_vbroadcastf128_ps_256 addr:$src), (VBROADCASTF128 addr:$src)>; -def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; -def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSYrm addr:$src)>; -def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), - (VBROADCASTSDrm addr:$src)>; - -def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; -def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), - (VBROADCASTSSrm addr:$src)>; //===----------------------------------------------------------------------===// // VINSERTF128 - Insert packed floating-point values @@ -7300,8 +7240,7 @@ def : Pat<(vextractf128_extract:$ext VR256:$src1, (i32 imm)), // multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, Intrinsic IntLd, Intrinsic IntLd256, - Intrinsic IntSt, Intrinsic IntSt256, - PatFrag pf128, PatFrag pf256> { + Intrinsic IntSt, Intrinsic IntSt256> { def rm : AVX8I<opc_rm, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, f128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -7322,18 +7261,18 @@ multiclass avx_movmask_rm<bits<8> opc_rm, bits<8> opc_mr, string OpcodeStr, [(IntSt256 addr:$dst, VR256:$src1, VR256:$src2)]>, VEX_4V; } +let ExeDomain = SSEPackedSingle in defm VMASKMOVPS : avx_movmask_rm<0x2C, 0x2E, "vmaskmovps", int_x86_avx_maskload_ps, int_x86_avx_maskload_ps_256, int_x86_avx_maskstore_ps, - int_x86_avx_maskstore_ps_256, - memopv4f32, memopv8f32>; + int_x86_avx_maskstore_ps_256>; +let ExeDomain = SSEPackedDouble in defm VMASKMOVPD : avx_movmask_rm<0x2D, 0x2F, "vmaskmovpd", int_x86_avx_maskload_pd, int_x86_avx_maskload_pd_256, int_x86_avx_maskstore_pd, - int_x86_avx_maskstore_pd_256, - memopv2f64, memopv4f64>; + int_x86_avx_maskstore_pd_256>; //===----------------------------------------------------------------------===// // VPERMIL - Permute Single and Double Floating-Point Values @@ -7361,22 +7300,26 @@ multiclass avx_permil<bits<8> opc_rm, bits<8> opc_rmi, string OpcodeStr, [(set RC:$dst, (IntImm (f_frag addr:$src1), imm:$src2))]>, VEX; } -defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, - memopv4f32, memopv4i32, - int_x86_avx_vpermilvar_ps, - int_x86_avx_vpermil_ps>; -defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, - memopv8f32, memopv8i32, - int_x86_avx_vpermilvar_ps_256, - int_x86_avx_vpermil_ps_256>; -defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, - memopv2f64, memopv2i64, - int_x86_avx_vpermilvar_pd, - int_x86_avx_vpermil_pd>; -defm VPERMILPDY : avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, - memopv4f64, memopv4i64, - int_x86_avx_vpermilvar_pd_256, - int_x86_avx_vpermil_pd_256>; +let ExeDomain = SSEPackedSingle in { + defm VPERMILPS : avx_permil<0x0C, 0x04, "vpermilps", VR128, f128mem, i128mem, + memopv4f32, memopv4i32, + int_x86_avx_vpermilvar_ps, + int_x86_avx_vpermil_ps>; + defm VPERMILPSY : avx_permil<0x0C, 0x04, "vpermilps", VR256, f256mem, i256mem, + memopv8f32, memopv8i32, + int_x86_avx_vpermilvar_ps_256, + int_x86_avx_vpermil_ps_256>; +} +let ExeDomain = SSEPackedDouble in { + defm VPERMILPD : avx_permil<0x0D, 0x05, "vpermilpd", VR128, f128mem, i128mem, + memopv2f64, memopv2i64, + int_x86_avx_vpermilvar_pd, + int_x86_avx_vpermil_pd>; + defm VPERMILPDY 
: avx_permil<0x0D, 0x05, "vpermilpd", VR256, f256mem, i256mem, + memopv4f64, memopv4i64, + int_x86_avx_vpermilvar_pd_256, + int_x86_avx_vpermil_pd_256>; +} def : Pat<(v8f32 (X86VPermilpsy VR256:$src1, (i8 imm:$imm))), (VPERMILPSYri VR256:$src1, imm:$imm)>; @@ -7549,6 +7492,40 @@ defm VPBROADCASTQ : avx2_broadcast<0x59, "vpbroadcastq", i64mem, loadi64, int_x86_avx2_pbroadcastq_128, int_x86_avx2_pbroadcastq_256>; +let Predicates = [HasAVX2] in { + def : Pat<(v16i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBrm addr:$src)>; + def : Pat<(v32i8 (X86VBroadcast (loadi8 addr:$src))), + (VPBROADCASTBYrm addr:$src)>; + def : Pat<(v8i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWrm addr:$src)>; + def : Pat<(v16i16 (X86VBroadcast (loadi16 addr:$src))), + (VPBROADCASTWYrm addr:$src)>; + def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDrm addr:$src)>; + def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VPBROADCASTDYrm addr:$src)>; + def : Pat<(v2i64 (X86VBroadcast (loadi64 addr:$src))), + (VPBROADCASTQrm addr:$src)>; + def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VPBROADCASTQYrm addr:$src)>; +} + +// AVX1 broadcast patterns +def : Pat<(v8i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4i64 (X86VBroadcast (loadi64 addr:$src))), + (VBROADCASTSDrm addr:$src)>; +def : Pat<(v8f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSYrm addr:$src)>; +def : Pat<(v4f64 (X86VBroadcast (loadf64 addr:$src))), + (VBROADCASTSDrm addr:$src)>; + +def : Pat<(v4f32 (X86VBroadcast (loadf32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; +def : Pat<(v4i32 (X86VBroadcast (loadi32 addr:$src))), + (VBROADCASTSSrm addr:$src)>; + //===----------------------------------------------------------------------===// // VPERM - Permute instructions // @@ -7569,6 +7546,7 @@ multiclass avx2_perm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, } defm VPERMD : avx2_perm<0x36, "vpermd", memopv8i32, int_x86_avx2_permd>; +let ExeDomain = SSEPackedSingle in defm VPERMPS : avx2_perm<0x16, "vpermps", memopv8f32, int_x86_avx2_permps>; multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, @@ -7588,6 +7566,7 @@ multiclass avx2_perm_imm<bits<8> opc, string OpcodeStr, PatFrag mem_frag, defm VPERMQ : avx2_perm_imm<0x00, "vpermq", memopv4i64, int_x86_avx2_permq>, VEX_W; +let ExeDomain = SSEPackedDouble in defm VPERMPD : avx2_perm_imm<0x01, "vpermpd", memopv4f64, int_x86_avx2_permpd>, VEX_W; @@ -7643,8 +7622,7 @@ def VEXTRACTI128mr : AVX2AIi8<0x39, MRMDestMem, (outs), // multiclass avx2_pmovmask<string OpcodeStr, Intrinsic IntLd128, Intrinsic IntLd256, - Intrinsic IntSt128, Intrinsic IntSt256, - PatFrag pf128, PatFrag pf256> { + Intrinsic IntSt128, Intrinsic IntSt256> { def rm : AVX28I<0x8c, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), @@ -7667,124 +7645,49 @@ defm VPMASKMOVD : avx2_pmovmask<"vpmaskmovd", int_x86_avx2_maskload_d, int_x86_avx2_maskload_d_256, int_x86_avx2_maskstore_d, - int_x86_avx2_maskstore_d_256, - memopv4i32, memopv8i32>; + int_x86_avx2_maskstore_d_256>; defm VPMASKMOVQ : avx2_pmovmask<"vpmaskmovq", int_x86_avx2_maskload_q, int_x86_avx2_maskload_q_256, int_x86_avx2_maskstore_q, - int_x86_avx2_maskstore_q_256, - memopv2i64, memopv4i64>, VEX_W; + int_x86_avx2_maskstore_q_256>, VEX_W; //===----------------------------------------------------------------------===// // Variable Bit Shifts // -multiclass avx2_var_shift<bits<8> opc, string 
OpcodeStr, - Intrinsic Int128, Intrinsic Int256> { +multiclass avx2_var_shift<bits<8> opc, string OpcodeStr, SDNode OpNode, + ValueType vt128, ValueType vt256> { def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), (ins VR128:$src1, VR128:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2))]>, VEX_4V; - def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), - (ins VR128:$src1, i128mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int128 VR128:$src1, (bitconvert (memopv2i64 addr:$src2))))]>, - VEX_4V; - def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), - (ins VR256:$src1, VR256:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2))]>, VEX_4V; - def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), - (ins VR256:$src1, i256mem:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, - (Int256 VR256:$src1, (bitconvert (memopv4i64 addr:$src2))))]>, + (vt128 (OpNode VR128:$src1, (vt128 VR128:$src2))))]>, VEX_4V; -} - -multiclass avx2_var_shift_i64<bits<8> opc, string OpcodeStr, - Intrinsic Int128, Intrinsic Int256> { - def rr : AVX28I<opc, MRMSrcReg, (outs VR128:$dst), - (ins VR128:$src1, VR128:$src2), - !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR128:$dst, (Int128 VR128:$src1, VR128:$src2))]>, VEX_4V; def rm : AVX28I<opc, MRMSrcMem, (outs VR128:$dst), (ins VR128:$src1, i128mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR128:$dst, - (Int128 VR128:$src1, (memopv2i64 addr:$src2)))]>, + (vt128 (OpNode VR128:$src1, + (vt128 (bitconvert (memopv2i64 addr:$src2))))))]>, VEX_4V; def Yrr : AVX28I<opc, MRMSrcReg, (outs VR256:$dst), (ins VR256:$src1, VR256:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), - [(set VR256:$dst, (Int256 VR256:$src1, VR256:$src2))]>, VEX_4V; + [(set VR256:$dst, + (vt256 (OpNode VR256:$src1, (vt256 VR256:$src2))))]>, + VEX_4V; def Yrm : AVX28I<opc, MRMSrcMem, (outs VR256:$dst), (ins VR256:$src1, i256mem:$src2), !strconcat(OpcodeStr, "\t{$src2, $src1, $dst|$dst, $src1, $src2}"), [(set VR256:$dst, - (Int256 VR256:$src1, (memopv4i64 addr:$src2)))]>, + (vt256 (OpNode VR256:$src1, + (vt256 (bitconvert (memopv4i64 addr:$src2))))))]>, VEX_4V; } -defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", int_x86_avx2_psllv_d, - int_x86_avx2_psllv_d_256>; -defm VPSLLVQ : avx2_var_shift_i64<0x47, "vpsllvq", int_x86_avx2_psllv_q, - int_x86_avx2_psllv_q_256>, VEX_W; -defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", int_x86_avx2_psrlv_d, - int_x86_avx2_psrlv_d_256>; -defm VPSRLVQ : avx2_var_shift_i64<0x45, "vpsrlvq", int_x86_avx2_psrlv_q, - int_x86_avx2_psrlv_q_256>, VEX_W; -defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", int_x86_avx2_psrav_d, - int_x86_avx2_psrav_d_256>; - -let Predicates = [HasAVX2] in { - def : Pat<(v4i32 (shl (v4i32 VR128:$src1), (v4i32 VR128:$src2))), - (VPSLLVDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (shl (v2i64 VR128:$src1), (v2i64 VR128:$src2))), - (VPSLLVQrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (srl (v4i32 VR128:$src1), (v4i32 VR128:$src2))), - (VPSRLVDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v2i64 (srl (v2i64 VR128:$src1), (v2i64 VR128:$src2))), - (VPSRLVQrr VR128:$src1, VR128:$src2)>; - def : Pat<(v4i32 (sra (v4i32 VR128:$src1), (v4i32 VR128:$src2))), - (VPSRAVDrr VR128:$src1, VR128:$src2)>; - def : Pat<(v8i32 (shl 
(v8i32 VR256:$src1), (v8i32 VR256:$src2))), - (VPSLLVDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (shl (v4i64 VR256:$src1), (v4i64 VR256:$src2))), - (VPSLLVQYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (srl (v8i32 VR256:$src1), (v8i32 VR256:$src2))), - (VPSRLVDYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v4i64 (srl (v4i64 VR256:$src1), (v4i64 VR256:$src2))), - (VPSRLVQYrr VR256:$src1, VR256:$src2)>; - def : Pat<(v8i32 (sra (v8i32 VR256:$src1), (v8i32 VR256:$src2))), - (VPSRAVDYrr VR256:$src1, VR256:$src2)>; - - def : Pat<(v4i32 (shl (v4i32 VR128:$src1), - (v4i32 (bitconvert (memopv2i64 addr:$src2))))), - (VPSLLVDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (shl (v2i64 VR128:$src1), (memopv2i64 addr:$src2))), - (VPSLLVQrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (srl (v4i32 VR128:$src1), - (v4i32 (bitconvert (memopv2i64 addr:$src2))))), - (VPSRLVDrm VR128:$src1, addr:$src2)>; - def : Pat<(v2i64 (srl (v2i64 VR128:$src1), (memopv2i64 addr:$src2))), - (VPSRLVQrm VR128:$src1, addr:$src2)>; - def : Pat<(v4i32 (sra (v4i32 VR128:$src1), - (v4i32 (bitconvert (memopv2i64 addr:$src2))))), - (VPSRAVDrm VR128:$src1, addr:$src2)>; - def : Pat<(v8i32 (shl (v8i32 VR256:$src1), - (v8i32 (bitconvert (memopv4i64 addr:$src2))))), - (VPSLLVDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (shl (v4i64 VR256:$src1), (memopv4i64 addr:$src2))), - (VPSLLVQYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (srl (v8i32 VR256:$src1), - (v8i32 (bitconvert (memopv4i64 addr:$src2))))), - (VPSRLVDYrm VR256:$src1, addr:$src2)>; - def : Pat<(v4i64 (srl (v4i64 VR256:$src1), (memopv4i64 addr:$src2))), - (VPSRLVQYrm VR256:$src1, addr:$src2)>; - def : Pat<(v8i32 (sra (v8i32 VR256:$src1), - (v8i32 (bitconvert (memopv4i64 addr:$src2))))), - (VPSRAVDYrm VR256:$src1, addr:$src2)>; -} +defm VPSLLVD : avx2_var_shift<0x47, "vpsllvd", shl, v4i32, v8i32>; +defm VPSLLVQ : avx2_var_shift<0x47, "vpsllvq", shl, v2i64, v4i64>, VEX_W; +defm VPSRLVD : avx2_var_shift<0x45, "vpsrlvd", srl, v4i32, v8i32>; +defm VPSRLVQ : avx2_var_shift<0x45, "vpsrlvq", srl, v2i64, v4i64>, VEX_W; +defm VPSRAVD : avx2_var_shift<0x46, "vpsravd", sra, v4i32, v8i32>; diff --git a/lib/Target/X86/X86MCInstLower.cpp b/lib/Target/X86/X86MCInstLower.cpp index 328cf67..81ee665 100644 --- a/lib/Target/X86/X86MCInstLower.cpp +++ b/lib/Target/X86/X86MCInstLower.cpp @@ -376,6 +376,7 @@ ReSimplify: case X86::AVX_SET0PSY: LowerUnaryToTwoAddr(OutMI, X86::VXORPSYrr); break; case X86::AVX_SET0PDY: LowerUnaryToTwoAddr(OutMI, X86::VXORPDYrr); break; case X86::AVX_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDrr); break; + case X86::AVX2_SETALLONES: LowerUnaryToTwoAddr(OutMI, X86::VPCMPEQDYrr);break; case X86::MOV16r0: LowerSubReg32_Op0(OutMI, X86::MOV32r0); // MOV16r0 -> MOV32r0 diff --git a/lib/Target/X86/X86Subtarget.h b/lib/Target/X86/X86Subtarget.h index 763fb43..e93f8e9 100644 --- a/lib/Target/X86/X86Subtarget.h +++ b/lib/Target/X86/X86Subtarget.h @@ -190,6 +190,10 @@ public: bool hasAVX2() const { return HasAVX2; } bool hasXMM() const { return hasSSE1() || hasAVX(); } bool hasXMMInt() const { return hasSSE2() || hasAVX(); } + bool hasSSE3orAVX() const { return hasSSE3() || hasAVX(); } + bool hasSSSE3orAVX() const { return hasSSSE3() || hasAVX(); } + bool hasSSE41orAVX() const { return hasSSE41() || hasAVX(); } + bool hasSSE42orAVX() const { return hasSSE42() || hasAVX(); } bool hasAES() const { return HasAES; } bool hasCLMUL() const { return HasCLMUL; } bool hasFMA3() const { return HasFMA3; } diff --git a/lib/Target/X86/X86TargetMachine.cpp 
b/lib/Target/X86/X86TargetMachine.cpp index 4d4d7c0..1c9f3bd 100644 --- a/lib/Target/X86/X86TargetMachine.cpp +++ b/lib/Target/X86/X86TargetMachine.cpp @@ -31,8 +31,9 @@ extern "C" void LLVMInitializeX86Target() { X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : X86TargetMachine(T, TT, CPU, FS, RM, CM, false), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : X86TargetMachine(T, TT, CPU, FS, RM, CM, OL, false), DataLayout(getSubtargetImpl()->isTargetDarwin() ? "e-p:32:32-f64:32:64-i64:32:64-f80:128:128-f128:128:128-" "n8:16:32-S128" : @@ -51,8 +52,9 @@ X86_32TargetMachine::X86_32TargetMachine(const Target &T, StringRef TT, X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : X86TargetMachine(T, TT, CPU, FS, RM, CM, true), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : X86TargetMachine(T, TT, CPU, FS, RM, CM, OL, true), DataLayout("e-p:64:64-s:64-f64:64:64-i64:64:64-f80:128:128-f128:128:128-" "n8:16:32:64-S128"), InstrInfo(*this), @@ -66,8 +68,9 @@ X86_64TargetMachine::X86_64TargetMachine(const Target &T, StringRef TT, X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS, StackAlignmentOverride, is64Bit), FrameLowering(*this, Subtarget), ELFWriterInfo(is64Bit, true) { @@ -102,16 +105,15 @@ X86TargetMachine::X86TargetMachine(const Target &T, StringRef TT, static cl::opt<bool> UseVZeroUpper("x86-use-vzeroupper", cl::desc("Minimize AVX to SSE transition penalty"), - cl::init(false)); + cl::init(true)); //===----------------------------------------------------------------------===// // Pass Pipeline Configuration //===----------------------------------------------------------------------===// -bool X86TargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool X86TargetMachine::addInstSelector(PassManagerBase &PM) { // Install an instruction selector. - PM.add(createX86ISelDag(*this, OptLevel)); + PM.add(createX86ISelDag(*this, getOptLevel())); // For 32-bit, prepend instructions to set the "global base reg" for PIC. if (!Subtarget.is64Bit()) @@ -120,33 +122,21 @@ bool X86TargetMachine::addInstSelector(PassManagerBase &PM, return false; } -bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool X86TargetMachine::addPreRegAlloc(PassManagerBase &PM) { PM.add(createX86MaxStackAlignmentHeuristicPass()); return false; // -print-machineinstr shouldn't print after this. } -bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool X86TargetMachine::addPostRegAlloc(PassManagerBase &PM) { PM.add(createX86FloatingPointStackifierPass()); return true; // -print-machineinstr should print after this. 
} -bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM) { bool ShouldPrint = false; - if (OptLevel != CodeGenOpt::None) { - if (Subtarget.hasXMMInt()) { - PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass)); - ShouldPrint = true; - } - if (Subtarget.hasAVX2()) { - // FIXME this should be turned on for just AVX, but the pass doesn't check - // that instructions are valid before replacing them and there are AVX2 - // integer instructions in the table. - PM.add(createExecutionDependencyFixPass(&X86::VR256RegClass)); - ShouldPrint = true; - } + if (getOptLevel() != CodeGenOpt::None && Subtarget.hasXMMInt()) { + PM.add(createExecutionDependencyFixPass(&X86::VR128RegClass)); + ShouldPrint = true; } if (Subtarget.hasAVX() && UseVZeroUpper) { @@ -158,7 +148,6 @@ bool X86TargetMachine::addPreEmitPass(PassManagerBase &PM, } bool X86TargetMachine::addCodeEmitter(PassManagerBase &PM, - CodeGenOpt::Level OptLevel, JITCodeEmitter &JCE) { PM.add(createX86JITCodeEmitterPass(*this, JCE)); diff --git a/lib/Target/X86/X86TargetMachine.h b/lib/Target/X86/X86TargetMachine.h index d1569aa..64be458 100644 --- a/lib/Target/X86/X86TargetMachine.h +++ b/lib/Target/X86/X86TargetMachine.h @@ -40,6 +40,7 @@ public: X86TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL, bool is64Bit); virtual const X86InstrInfo *getInstrInfo() const { @@ -66,11 +67,11 @@ public: } // Set up the pass pipeline. - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPostRegAlloc(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addPreEmitPass(PassManagerBase &PM, CodeGenOpt::Level OptLevel); - virtual bool addCodeEmitter(PassManagerBase &PM, CodeGenOpt::Level OptLevel, + virtual bool addInstSelector(PassManagerBase &PM); + virtual bool addPreRegAlloc(PassManagerBase &PM); + virtual bool addPostRegAlloc(PassManagerBase &PM); + virtual bool addPreEmitPass(PassManagerBase &PM); + virtual bool addCodeEmitter(PassManagerBase &PM, JITCodeEmitter &JCE); }; @@ -85,7 +86,8 @@ class X86_32TargetMachine : public X86TargetMachine { public: X86_32TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const TargetData *getTargetData() const { return &DataLayout; } virtual const X86TargetLowering *getTargetLowering() const { return &TLInfo; @@ -112,7 +114,8 @@ class X86_64TargetMachine : public X86TargetMachine { public: X86_64TargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const TargetData *getTargetData() const { return &DataLayout; } virtual const X86TargetLowering *getTargetLowering() const { return &TLInfo; diff --git a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp index 276e841..7d5fcce 100644 --- a/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp +++ b/lib/Target/XCore/MCTargetDesc/XCoreMCTargetDesc.cpp @@ -61,9 +61,10 @@ static MCAsmInfo *createXCoreMCAsmInfo(const Target &T, StringRef TT) { } static MCCodeGenInfo *createXCoreMCCodeGenInfo(StringRef TT, Reloc::Model RM, - CodeModel::Model 
CM) { + CodeModel::Model CM, + CodeGenOpt::Level OL) { MCCodeGenInfo *X = new MCCodeGenInfo(); - X->InitMCCodeGenInfo(RM, CM); + X->InitMCCodeGenInfo(RM, CM, OL); return X; } diff --git a/lib/Target/XCore/XCoreTargetMachine.cpp b/lib/Target/XCore/XCoreTargetMachine.cpp index fdc5d35..eec3674 100644 --- a/lib/Target/XCore/XCoreTargetMachine.cpp +++ b/lib/Target/XCore/XCoreTargetMachine.cpp @@ -21,8 +21,9 @@ using namespace llvm; /// XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM) - : LLVMTargetMachine(T, TT, CPU, FS, RM, CM), + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL) + : LLVMTargetMachine(T, TT, CPU, FS, RM, CM, OL), Subtarget(TT, CPU, FS), DataLayout("e-p:32:32:32-a0:0:32-f32:32:32-f64:32:32-i1:8:32-i8:8:32-" "i16:16:32-i32:32:32-i64:32:32-n32"), @@ -32,8 +33,7 @@ XCoreTargetMachine::XCoreTargetMachine(const Target &T, StringRef TT, TSInfo(*this) { } -bool XCoreTargetMachine::addInstSelector(PassManagerBase &PM, - CodeGenOpt::Level OptLevel) { +bool XCoreTargetMachine::addInstSelector(PassManagerBase &PM) { PM.add(createXCoreISelDag(*this)); return false; } diff --git a/lib/Target/XCore/XCoreTargetMachine.h b/lib/Target/XCore/XCoreTargetMachine.h index 83d09d6..3f2644d 100644 --- a/lib/Target/XCore/XCoreTargetMachine.h +++ b/lib/Target/XCore/XCoreTargetMachine.h @@ -34,7 +34,8 @@ class XCoreTargetMachine : public LLVMTargetMachine { public: XCoreTargetMachine(const Target &T, StringRef TT, StringRef CPU, StringRef FS, - Reloc::Model RM, CodeModel::Model CM); + Reloc::Model RM, CodeModel::Model CM, + CodeGenOpt::Level OL); virtual const XCoreInstrInfo *getInstrInfo() const { return &InstrInfo; } virtual const XCoreFrameLowering *getFrameLowering() const { @@ -55,7 +56,7 @@ public: virtual const TargetData *getTargetData() const { return &DataLayout; } // Pass Pipeline Configuration - virtual bool addInstSelector(PassManagerBase &PM, CodeGenOpt::Level OptLevel); + virtual bool addInstSelector(PassManagerBase &PM); }; } // end namespace llvm diff --git a/lib/Transforms/IPO/DeadArgumentElimination.cpp b/lib/Transforms/IPO/DeadArgumentElimination.cpp index 4bb6f7a..95aef27 100644 --- a/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -74,7 +74,7 @@ namespace { std::string getDescription() const { return std::string((IsArg ? "Argument #" : "Return value #")) - + utostr(Idx) + " of function " + F->getNameStr(); + + utostr(Idx) + " of function " + F->getName().str(); } }; diff --git a/lib/Transforms/InstCombine/InstCombineCalls.cpp b/lib/Transforms/InstCombine/InstCombineCalls.cpp index c7b3ff8..e8136ab 100644 --- a/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -760,7 +760,7 @@ static bool isSafeToEliminateVarargsCast(const CallSite CS, // The size of ByVal arguments is derived from the type, so we // can't change to a type with a different size. If the size were // passed explicitly we could avoid this check. - if (!CS.paramHasAttr(ix, Attribute::ByVal)) + if (!CS.isByValArgument(ix)) return true; Type* SrcTy = @@ -960,7 +960,7 @@ Instruction *InstCombiner::visitCallSite(CallSite CS) { PointerType *PTy = cast<PointerType>(Callee->getType()); FunctionType *FTy = cast<FunctionType>(PTy->getElementType()); if (FTy->isVarArg()) { - int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 3 : 1); + int ix = FTy->getNumParams() + (isa<InvokeInst>(Callee) ? 
2 : 0); // See if we can optimize any arguments passed through the varargs area of // the call. for (CallSite::arg_iterator I = CS.arg_begin()+FTy->getNumParams(), diff --git a/lib/Transforms/InstCombine/InstructionCombining.cpp b/lib/Transforms/InstCombine/InstructionCombining.cpp index 0cc969b..a7a6311 100644 --- a/lib/Transforms/InstCombine/InstructionCombining.cpp +++ b/lib/Transforms/InstCombine/InstructionCombining.cpp @@ -1902,7 +1902,7 @@ bool InstCombiner::DoOneIteration(Function &F, unsigned Iteration) { MadeIRChange = false; DEBUG(errs() << "\n\nINSTCOMBINE ITERATION #" << Iteration << " on " - << F.getNameStr() << "\n"); + << F.getName() << "\n"); { // Do a depth-first traversal of the function, populate the worklist with diff --git a/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/lib/Transforms/Instrumentation/AddressSanitizer.cpp new file mode 100644 index 0000000..b617539 --- /dev/null +++ b/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -0,0 +1,987 @@ +//===-- AddressSanitizer.cpp - memory error detector ------------*- C++ -*-===// +// +// The LLVM Compiler Infrastructure +// +// This file is distributed under the University of Illinois Open Source +// License. See LICENSE.TXT for details. +// +//===----------------------------------------------------------------------===// +// +// This file is a part of AddressSanitizer, an address sanity checker. +// Details of the algorithm: +// http://code.google.com/p/address-sanitizer/wiki/AddressSanitizerAlgorithm +// +//===----------------------------------------------------------------------===// + +#define DEBUG_TYPE "asan" + +#include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/OwningPtr.h" +#include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringExtras.h" +#include "llvm/Function.h" +#include "llvm/InlineAsm.h" +#include "llvm/IntrinsicInst.h" +#include "llvm/LLVMContext.h" +#include "llvm/Module.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Support/DataTypes.h" +#include "llvm/Support/Debug.h" +#include "llvm/Support/IRBuilder.h" +#include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/Regex.h" +#include "llvm/Support/raw_ostream.h" +#include "llvm/Support/system_error.h" +#include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetMachine.h" +#include "llvm/Transforms/Instrumentation.h" +#include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/ModuleUtils.h" +#include "llvm/Type.h" + +#include <string> +#include <algorithm> + +using namespace llvm; + +static const uint64_t kDefaultShadowScale = 3; +static const uint64_t kDefaultShadowOffset32 = 1ULL << 29; +static const uint64_t kDefaultShadowOffset64 = 1ULL << 44; + +static const size_t kMaxStackMallocSize = 1 << 16; // 64K +static const uintptr_t kCurrentStackFrameMagic = 0x41B58AB3; +static const uintptr_t kRetiredStackFrameMagic = 0x45E0360E; + +static const char *kAsanModuleCtorName = "asan.module_ctor"; +static const char *kAsanReportErrorTemplate = "__asan_report_"; +static const char *kAsanRegisterGlobalsName = "__asan_register_globals"; +static const char *kAsanInitName = "__asan_init"; +static const char *kAsanMappingOffsetName = "__asan_mapping_offset"; +static const char *kAsanMappingScaleName = "__asan_mapping_scale"; +static const char *kAsanStackMallocName = "__asan_stack_malloc"; +static const char *kAsanStackFreeName = "__asan_stack_free"; + +static const int kAsanStackLeftRedzoneMagic = 0xf1; +static const int 
kAsanStackMidRedzoneMagic = 0xf2; +static const int kAsanStackRightRedzoneMagic = 0xf3; +static const int kAsanStackPartialRedzoneMagic = 0xf4; + +// Command-line flags. + +// This flag may need to be replaced with -f[no-]asan-reads. +static cl::opt<bool> ClInstrumentReads("asan-instrument-reads", + cl::desc("instrument read instructions"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClInstrumentWrites("asan-instrument-writes", + cl::desc("instrument write instructions"), cl::Hidden, cl::init(true)); +// This flag may need to be replaced with -f[no]asan-stack. +static cl::opt<bool> ClStack("asan-stack", + cl::desc("Handle stack memory"), cl::Hidden, cl::init(true)); +// This flag may need to be replaced with -f[no]asan-use-after-return. +static cl::opt<bool> ClUseAfterReturn("asan-use-after-return", + cl::desc("Check return-after-free"), cl::Hidden, cl::init(false)); +// This flag may need to be replaced with -f[no]asan-globals. +static cl::opt<bool> ClGlobals("asan-globals", + cl::desc("Handle global objects"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClMemIntrin("asan-memintrin", + cl::desc("Handle memset/memcpy/memmove"), cl::Hidden, cl::init(true)); +// This flag may need to be replaced with -fasan-blacklist. +static cl::opt<std::string> ClBlackListFile("asan-blacklist", + cl::desc("File containing the list of functions to ignore " + "during instrumentation"), cl::Hidden); +static cl::opt<bool> ClUseCall("asan-use-call", + cl::desc("Use function call to generate a crash"), cl::Hidden, + cl::init(true)); + +// These flags allow changing the shadow mapping. +// The shadow mapping looks like +// Shadow = (Mem >> scale) + (1 << offset_log) +static cl::opt<int> ClMappingScale("asan-mapping-scale", + cl::desc("scale of asan shadow mapping"), cl::Hidden, cl::init(0)); +static cl::opt<int> ClMappingOffsetLog("asan-mapping-offset-log", + cl::desc("offset of asan shadow mapping"), cl::Hidden, cl::init(-1)); + +// Optimization flags. Not user visible, used mostly for testing +// and benchmarking the tool. +static cl::opt<bool> ClOpt("asan-opt", + cl::desc("Optimize instrumentation"), cl::Hidden, cl::init(true)); +static cl::opt<bool> ClOptSameTemp("asan-opt-same-temp", + cl::desc("Instrument the same temp just once"), cl::Hidden, + cl::init(true)); +static cl::opt<bool> ClOptGlobals("asan-opt-globals", + cl::desc("Don't instrument scalar globals"), cl::Hidden, cl::init(true)); + +// Debug flags. +static cl::opt<int> ClDebug("asan-debug", cl::desc("debug"), cl::Hidden, + cl::init(0)); +static cl::opt<int> ClDebugStack("asan-debug-stack", cl::desc("debug stack"), + cl::Hidden, cl::init(0)); +static cl::opt<std::string> ClDebugFunc("asan-debug-func", + cl::Hidden, cl::desc("Debug func")); +static cl::opt<int> ClDebugMin("asan-debug-min", cl::desc("Debug min inst"), + cl::Hidden, cl::init(-1)); +static cl::opt<int> ClDebugMax("asan-debug-max", cl::desc("Debug max inst"), + cl::Hidden, cl::init(-1)); + +namespace { + +// Blacklisted functions are not instrumented. +// The blacklist file contains one or more lines like this: +// --- +// fun:FunctionWildCard +// --- +// This is similar to the "ignore" feature of ThreadSanitizer. +// http://code.google.com/p/data-race-test/wiki/ThreadSanitizerIgnores +class BlackList { + public: + BlackList(const std::string &Path); + bool isIn(const Function &F); + private: + Regex *Functions; +}; + +/// AddressSanitizer: instrument the code in module to find memory bugs.
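Before the pass class itself, an aside on the shadow mapping selected by the asan-mapping-* flags above. The sketch below is my own illustration of the default arithmetic, not code from the patch; it mirrors what memToShadow() later emits via IRBuilder, using the kDefaultShadowScale and kDefaultShadowOffset64 constants defined earlier in this file.

#include <cstdint>
#include <cstdio>

static const uint64_t kScale  = 3;           // one shadow byte per 8 app bytes
static const uint64_t kOffset = 1ULL << 44;  // kDefaultShadowOffset64

// For user-space addresses below 2^47, (Mem >> 3) stays below 2^44, so the
// offset bit is never already set and OR coincides with ADD; this is why the
// pass can use CreateOr in memToShadow().
uint64_t memToShadow(uint64_t Mem) { return (Mem >> kScale) | kOffset; }

int main() {
  uint64_t Addr = 0x7f0012345678ULL;
  std::printf("shadow(0x%llx) = 0x%llx\n",
              (unsigned long long)Addr,
              (unsigned long long)memToShadow(Addr));
  return 0;
}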
+struct AddressSanitizer : public ModulePass { + AddressSanitizer(); + void instrumentMop(Instruction *I); + void instrumentAddress(Instruction *OrigIns, IRBuilder<> &IRB, + Value *Addr, uint32_t TypeSize, bool IsWrite); + Instruction *generateCrashCode(IRBuilder<> &IRB, Value *Addr, + bool IsWrite, uint32_t TypeSize); + bool instrumentMemIntrinsic(MemIntrinsic *MI); + void instrumentMemIntrinsicParam(Instruction *OrigIns, Value *Addr, + Value *Size, + Instruction *InsertBefore, bool IsWrite); + Value *memToShadow(Value *Shadow, IRBuilder<> &IRB); + bool handleFunction(Module &M, Function &F); + bool poisonStackInFunction(Module &M, Function &F); + virtual bool runOnModule(Module &M); + bool insertGlobalRedzones(Module &M); + BranchInst *splitBlockAndInsertIfThen(Instruction *SplitBefore, Value *Cmp); + static char ID; // Pass identification, replacement for typeid + + private: + + uint64_t getAllocaSizeInBytes(AllocaInst *AI) { + Type *Ty = AI->getAllocatedType(); + uint64_t SizeInBytes = TD->getTypeStoreSizeInBits(Ty) / 8; + return SizeInBytes; + } + uint64_t getAlignedSize(uint64_t SizeInBytes) { + return ((SizeInBytes + RedzoneSize - 1) + / RedzoneSize) * RedzoneSize; + } + uint64_t getAlignedAllocaSize(AllocaInst *AI) { + uint64_t SizeInBytes = getAllocaSizeInBytes(AI); + return getAlignedSize(SizeInBytes); + } + + void PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, IRBuilder<> IRB, + Value *ShadowBase, bool DoPoison); + bool LooksLikeCodeInBug11395(Instruction *I); + + Module *CurrentModule; + LLVMContext *C; + TargetData *TD; + uint64_t MappingOffset; + int MappingScale; + size_t RedzoneSize; + int LongSize; + Type *IntptrTy; + Type *IntptrPtrTy; + Function *AsanCtorFunction; + Function *AsanInitFunction; + Instruction *CtorInsertBefore; + OwningPtr<BlackList> BL; +}; +} // namespace + +char AddressSanitizer::ID = 0; +INITIALIZE_PASS(AddressSanitizer, "asan", + "AddressSanitizer: detects use-after-free and out-of-bounds bugs.", + false, false) +AddressSanitizer::AddressSanitizer() : ModulePass(ID) { } +ModulePass *llvm::createAddressSanitizerPass() { + return new AddressSanitizer(); +} + +// Create a constant for Str so that we can pass it to the run-time lib. +static GlobalVariable *createPrivateGlobalForString(Module &M, StringRef Str) { + Constant *StrConst = ConstantArray::get(M.getContext(), Str); + return new GlobalVariable(M, StrConst->getType(), true, + GlobalValue::PrivateLinkage, StrConst, ""); +} + +// Split the basic block and insert an if-then code. +// Before: +// Head +// SplitBefore +// Tail +// After: +// Head +// if (Cmp) +// NewBasicBlock +// SplitBefore +// Tail +// +// Returns the NewBasicBlock's terminator. 
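In source-level terms, the CFG surgery described in the comment above gives the instrumented access the shape below. This is an illustrative C++ analogy, not code from the patch; ReportError stands in for the __asan_report_* call that generateCrashCode() emits, and the real pass adds a second, finer-grained check for accesses smaller than a shadow granule.

#include <cstdint>
#include <cstdio>

// Stand-in for the run-time error report; names here are illustrative.
static void ReportError(uintptr_t Addr) {
  std::printf("asan: bad access at 0x%llx\n", (unsigned long long)Addr);
}

void instrumentedAccess(const uint8_t *ShadowPtr, uintptr_t Addr) {
  if (*ShadowPtr != 0)  // "Cmp": HeadNewTerm branches into NewBasicBlock
    ReportError(Addr);  // body of NewBasicBlock; CheckTerm then jumps to Tail
  // Tail: SplitBefore (the original memory access) and the rest of the block.
}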
+BranchInst *AddressSanitizer::splitBlockAndInsertIfThen( + Instruction *SplitBefore, Value *Cmp) { + BasicBlock *Head = SplitBefore->getParent(); + BasicBlock *Tail = Head->splitBasicBlock(SplitBefore); + TerminatorInst *HeadOldTerm = Head->getTerminator(); + BasicBlock *NewBasicBlock = + BasicBlock::Create(*C, "", Head->getParent()); + BranchInst *HeadNewTerm = BranchInst::Create(/*ifTrue*/NewBasicBlock, + /*ifFalse*/Tail, + Cmp); + ReplaceInstWithInst(HeadOldTerm, HeadNewTerm); + + BranchInst *CheckTerm = BranchInst::Create(Tail, NewBasicBlock); + return CheckTerm; +} + +Value *AddressSanitizer::memToShadow(Value *Shadow, IRBuilder<> &IRB) { + // Shadow >> scale + Shadow = IRB.CreateLShr(Shadow, MappingScale); + if (MappingOffset == 0) + return Shadow; + // (Shadow >> scale) | offset + return IRB.CreateOr(Shadow, ConstantInt::get(IntptrTy, + MappingOffset)); +} + +void AddressSanitizer::instrumentMemIntrinsicParam(Instruction *OrigIns, + Value *Addr, Value *Size, Instruction *InsertBefore, bool IsWrite) { + // Check the first byte. + { + IRBuilder<> IRB(InsertBefore); + instrumentAddress(OrigIns, IRB, Addr, 8, IsWrite); + } + // Check the last byte. + { + IRBuilder<> IRB(InsertBefore); + Value *SizeMinusOne = IRB.CreateSub( + Size, ConstantInt::get(Size->getType(), 1)); + SizeMinusOne = IRB.CreateIntCast(SizeMinusOne, IntptrTy, false); + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + Value *AddrPlusSizeMinusOne = IRB.CreateAdd(AddrLong, SizeMinusOne); + instrumentAddress(OrigIns, IRB, AddrPlusSizeMinusOne, 8, IsWrite); + } +} + +// Instrument memset/memmove/memcpy +bool AddressSanitizer::instrumentMemIntrinsic(MemIntrinsic *MI) { + Value *Dst = MI->getDest(); + MemTransferInst *MemTran = dyn_cast<MemTransferInst>(MI); + Value *Src = MemTran ? MemTran->getSource() : NULL; + Value *Length = MI->getLength(); + + Constant *ConstLength = dyn_cast<Constant>(Length); + Instruction *InsertBefore = MI; + if (ConstLength) { + if (ConstLength->isNullValue()) return false; + } else { + // The size is not a constant so it could be zero -- check at run-time. + IRBuilder<> IRB(InsertBefore); + + Value *Cmp = IRB.CreateICmpNE(Length, + Constant::getNullValue(Length->getType())); + InsertBefore = splitBlockAndInsertIfThen(InsertBefore, Cmp); + } + + instrumentMemIntrinsicParam(MI, Dst, Length, InsertBefore, true); + if (Src) + instrumentMemIntrinsicParam(MI, Src, Length, InsertBefore, false); + return true; +} + +static Value *getLDSTOperand(Instruction *I) { + if (LoadInst *LI = dyn_cast<LoadInst>(I)) { + return LI->getPointerOperand(); + } + return cast<StoreInst>(*I).getPointerOperand(); +} + +void AddressSanitizer::instrumentMop(Instruction *I) { + int IsWrite = isa<StoreInst>(*I); + Value *Addr = getLDSTOperand(I); + if (ClOpt && ClOptGlobals && isa<GlobalVariable>(Addr)) { + // We are accessing a global scalar variable. Nothing to catch here. + return; + } + Type *OrigPtrTy = Addr->getType(); + Type *OrigTy = cast<PointerType>(OrigPtrTy)->getElementType(); + + assert(OrigTy->isSized()); + uint32_t TypeSize = TD->getTypeStoreSizeInBits(OrigTy); + + if (TypeSize != 8 && TypeSize != 16 && + TypeSize != 32 && TypeSize != 64 && TypeSize != 128) { + // Ignore all unusual sizes. + return; + } + + IRBuilder<> IRB(I); + instrumentAddress(I, IRB, Addr, TypeSize, IsWrite); +} + +Instruction *AddressSanitizer::generateCrashCode( + IRBuilder<> &IRB, Value *Addr, bool IsWrite, uint32_t TypeSize) { + + if (ClUseCall) { + // Here we use a call instead of arch-specific asm to report an error.
+ // This is almost always slower (because the codegen needs to generate + // prologue/epilogue for otherwise leaf functions) and generates more code. + // This mode could be useful if we can not use SIGILL for some reason. + // + // IsWrite and TypeSize are encoded in the function name. + std::string FunctionName = std::string(kAsanReportErrorTemplate) + + (IsWrite ? "store" : "load") + itostr(TypeSize / 8); + Value *ReportWarningFunc = CurrentModule->getOrInsertFunction( + FunctionName, IRB.getVoidTy(), IntptrTy, NULL); + CallInst *Call = IRB.CreateCall(ReportWarningFunc, Addr); + Call->setDoesNotReturn(); + return Call; + } + + uint32_t LogOfSizeInBytes = CountTrailingZeros_32(TypeSize / 8); + assert(8U * (1 << LogOfSizeInBytes) == TypeSize); + uint8_t TelltaleValue = IsWrite * 8 + LogOfSizeInBytes; + assert(TelltaleValue < 16); + + // Move the failing address to %rax/%eax + FunctionType *Fn1Ty = FunctionType::get( + IRB.getVoidTy(), ArrayRef<Type*>(IntptrTy), false); + const char *MovStr = LongSize == 32 + ? "mov $0, %eax" : "mov $0, %rax"; + Value *AsmMov = InlineAsm::get( + Fn1Ty, StringRef(MovStr), StringRef("r"), true); + IRB.CreateCall(AsmMov, Addr); + + // crash with ud2; could use int3, but it is less friendly to gdb. + // after ud2 put a 1-byte instruction that encodes the access type and size. + + const char *TelltaleInsns[16] = { + "push %eax", // 0x50 + "push %ecx", // 0x51 + "push %edx", // 0x52 + "push %ebx", // 0x53 + "push %esp", // 0x54 + "push %ebp", // 0x55 + "push %esi", // 0x56 + "push %edi", // 0x57 + "pop %eax", // 0x58 + "pop %ecx", // 0x59 + "pop %edx", // 0x5a + "pop %ebx", // 0x5b + "pop %esp", // 0x5c + "pop %ebp", // 0x5d + "pop %esi", // 0x5e + "pop %edi" // 0x5f + }; + + std::string AsmStr = "ud2;"; + AsmStr += TelltaleInsns[TelltaleValue]; + Value *MyAsm = InlineAsm::get(FunctionType::get(Type::getVoidTy(*C), false), + StringRef(AsmStr), StringRef(""), true); + CallInst *AsmCall = IRB.CreateCall(MyAsm); + + // This saves us one jump, but triggers a bug in RA (or somewhere else): + // while building 483.xalancbmk the compiler goes into infinite loop in + // llvm::SpillPlacement::iterate() / RAGreedy::growRegion + // AsmCall->setDoesNotReturn(); + return AsmCall; +} + +void AddressSanitizer::instrumentAddress(Instruction *OrigIns, + IRBuilder<> &IRB, Value *Addr, + uint32_t TypeSize, bool IsWrite) { + Value *AddrLong = IRB.CreatePointerCast(Addr, IntptrTy); + + Type *ShadowTy = IntegerType::get( + *C, std::max(8U, TypeSize >> MappingScale)); + Type *ShadowPtrTy = PointerType::get(ShadowTy, 0); + Value *ShadowPtr = memToShadow(AddrLong, IRB); + Value *CmpVal = Constant::getNullValue(ShadowTy); + Value *ShadowValue = IRB.CreateLoad( + IRB.CreateIntToPtr(ShadowPtr, ShadowPtrTy)); + + Value *Cmp = IRB.CreateICmpNE(ShadowValue, CmpVal); + + Instruction *CheckTerm = splitBlockAndInsertIfThen( + cast<Instruction>(Cmp)->getNextNode(), Cmp); + IRBuilder<> IRB2(CheckTerm); + + size_t Granularity = 1 << MappingScale; + if (TypeSize < 8 * Granularity) { + // Addr & (Granularity - 1) + Value *Lower3Bits = IRB2.CreateAnd( + AddrLong, ConstantInt::get(IntptrTy, Granularity - 1)); + // (Addr & (Granularity - 1)) + size - 1 + Value *LastAccessedByte = IRB2.CreateAdd( + Lower3Bits, ConstantInt::get(IntptrTy, TypeSize / 8 - 1)); + // (uint8_t) ((Addr & (Granularity-1)) + size - 1) + LastAccessedByte = IRB2.CreateIntCast( + LastAccessedByte, IRB.getInt8Ty(), false); + // ((uint8_t) ((Addr & (Granularity-1)) + size - 1)) >= ShadowValue + Value *Cmp2 = 
IRB2.CreateICmpSGE(LastAccessedByte, ShadowValue); + + CheckTerm = splitBlockAndInsertIfThen(CheckTerm, Cmp2); + } + + IRBuilder<> IRB1(CheckTerm); + Instruction *Crash = generateCrashCode(IRB1, AddrLong, IsWrite, TypeSize); + Crash->setDebugLoc(OrigIns->getDebugLoc()); +} + +// This function replaces all global variables with new variables that have +// trailing redzones. It also creates a function that poisons +// redzones and inserts this function into llvm.global_ctors. +bool AddressSanitizer::insertGlobalRedzones(Module &M) { + SmallVector<GlobalVariable *, 16> GlobalsToChange; + + for (Module::GlobalListType::iterator G = M.getGlobalList().begin(), + E = M.getGlobalList().end(); G != E; ++G) { + Type *Ty = cast<PointerType>(G->getType())->getElementType(); + DEBUG(dbgs() << "GLOBAL: " << *G); + + if (!Ty->isSized()) continue; + if (!G->hasInitializer()) continue; + // Touch only those globals that will not be defined in other modules. + // Don't handle ODR type linkages since other modules may be built w/o asan. + if (G->getLinkage() != GlobalVariable::ExternalLinkage && + G->getLinkage() != GlobalVariable::PrivateLinkage && + G->getLinkage() != GlobalVariable::InternalLinkage) + continue; + // Two problems with thread-locals: + // - The address of the main thread's copy can't be computed at link-time. + // - Need to poison all copies, not just the main thread's one. + if (G->isThreadLocal()) + continue; + // For now, just ignore this global if the alignment is large. + if (G->getAlignment() > RedzoneSize) continue; + + // Ignore all the globals with the names starting with "\01L_OBJC_". + // Many of those are put into the .cstring section. The linker compresses + // that section by removing the spare \0s after the string terminator, so + // our redzones get broken. + if ((G->getName().find("\01L_OBJC_") == 0) || + (G->getName().find("\01l_OBJC_") == 0)) { + DEBUG(dbgs() << "Ignoring \\01L_OBJC_* global: " << *G); + continue; + } + + // Ignore the globals from the __OBJC section. The ObjC runtime assumes + // those conform to /usr/lib/objc/runtime.h, so we can't add redzones to + // them. + if (G->hasSection()) { + StringRef Section(G->getSection()); + if ((Section.find("__OBJC,") == 0) || + (Section.find("__DATA, __objc_") == 0)) { + DEBUG(dbgs() << "Ignoring ObjC runtime global: " << *G); + continue; + } + } + + GlobalsToChange.push_back(G); + } + + size_t n = GlobalsToChange.size(); + if (n == 0) return false; + + // A global is described by a structure + // size_t beg; + // size_t size; + // size_t size_with_redzone; + // const char *name; + // We initialize an array of such structures and pass it to a run-time call.
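Seen from the run-time side, one such descriptor could be declared as below. This is a sketch with illustrative field names (the real definition lives in the ASan run-time library, not in this patch); the layout matches the four IntptrTy members of the GlobalStructTy built next.

#include <cstddef>

// One entry of the array passed to __asan_register_globals(globals, n).
struct GlobalDescriptor {
  std::size_t beg;                // address of the new, redzoned global
  std::size_t size;               // size of the original global, in bytes
  std::size_t size_with_redzone;  // size including the trailing redzone
  const char *name;               // private string holding the global's name
};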
+ StructType *GlobalStructTy = StructType::get(IntptrTy, IntptrTy, + IntptrTy, IntptrTy, NULL); + SmallVector<Constant *, 16> Initializers(n); + + IRBuilder<> IRB(CtorInsertBefore); + + for (size_t i = 0; i < n; i++) { + GlobalVariable *G = GlobalsToChange[i]; + PointerType *PtrTy = cast<PointerType>(G->getType()); + Type *Ty = PtrTy->getElementType(); + uint64_t SizeInBytes = TD->getTypeStoreSizeInBits(Ty) / 8; + uint64_t RightRedzoneSize = RedzoneSize + + (RedzoneSize - (SizeInBytes % RedzoneSize)); + Type *RightRedZoneTy = ArrayType::get(IRB.getInt8Ty(), RightRedzoneSize); + + StructType *NewTy = StructType::get(Ty, RightRedZoneTy, NULL); + Constant *NewInitializer = ConstantStruct::get( + NewTy, G->getInitializer(), + Constant::getNullValue(RightRedZoneTy), NULL); + + GlobalVariable *Name = createPrivateGlobalForString(M, G->getName()); + + // Create a new global variable with enough space for a redzone. + GlobalVariable *NewGlobal = new GlobalVariable( + M, NewTy, G->isConstant(), G->getLinkage(), + NewInitializer, "", G, G->isThreadLocal()); + NewGlobal->copyAttributesFrom(G); + NewGlobal->setAlignment(RedzoneSize); + + Value *Indices2[2]; + Indices2[0] = IRB.getInt32(0); + Indices2[1] = IRB.getInt32(0); + + G->replaceAllUsesWith( + ConstantExpr::getGetElementPtr(NewGlobal, Indices2, 2)); + NewGlobal->takeName(G); + G->eraseFromParent(); + + Initializers[i] = ConstantStruct::get( + GlobalStructTy, + ConstantExpr::getPointerCast(NewGlobal, IntptrTy), + ConstantInt::get(IntptrTy, SizeInBytes), + ConstantInt::get(IntptrTy, SizeInBytes + RightRedzoneSize), + ConstantExpr::getPointerCast(Name, IntptrTy), + NULL); + DEBUG(dbgs() << "NEW GLOBAL:\n" << *NewGlobal); + } + + ArrayType *ArrayOfGlobalStructTy = ArrayType::get(GlobalStructTy, n); + GlobalVariable *AllGlobals = new GlobalVariable( + M, ArrayOfGlobalStructTy, false, GlobalVariable::PrivateLinkage, + ConstantArray::get(ArrayOfGlobalStructTy, Initializers), ""); + + Function *AsanRegisterGlobals = cast<Function>(M.getOrInsertFunction( + kAsanRegisterGlobalsName, IRB.getVoidTy(), IntptrTy, IntptrTy, NULL)); + AsanRegisterGlobals->setLinkage(Function::ExternalLinkage); + + IRB.CreateCall2(AsanRegisterGlobals, + IRB.CreatePointerCast(AllGlobals, IntptrTy), + ConstantInt::get(IntptrTy, n)); + + DEBUG(dbgs() << M); + return true; +} + +// virtual +bool AddressSanitizer::runOnModule(Module &M) { + // Initialize the private fields. No one has accessed them before. + TD = getAnalysisIfAvailable<TargetData>(); + if (!TD) + return false; + BL.reset(new BlackList(ClBlackListFile)); + + CurrentModule = &M; + C = &(M.getContext()); + LongSize = TD->getPointerSizeInBits(); + IntptrTy = Type::getIntNTy(*C, LongSize); + IntptrPtrTy = PointerType::get(IntptrTy, 0); + + AsanCtorFunction = Function::Create( + FunctionType::get(Type::getVoidTy(*C), false), + GlobalValue::InternalLinkage, kAsanModuleCtorName, &M); + BasicBlock *AsanCtorBB = BasicBlock::Create(*C, "", AsanCtorFunction); + CtorInsertBefore = ReturnInst::Create(*C, AsanCtorBB); + + // call __asan_init in the module ctor. + IRBuilder<> IRB(CtorInsertBefore); + AsanInitFunction = cast<Function>( + M.getOrInsertFunction(kAsanInitName, IRB.getVoidTy(), NULL)); + AsanInitFunction->setLinkage(Function::ExternalLinkage); + IRB.CreateCall(AsanInitFunction); + + MappingOffset = LongSize == 32 + ? 
kDefaultShadowOffset32 : kDefaultShadowOffset64; + if (ClMappingOffsetLog >= 0) { + if (ClMappingOffsetLog == 0) { + // special case + MappingOffset = 0; + } else { + MappingOffset = 1ULL << ClMappingOffsetLog; + } + } + MappingScale = kDefaultShadowScale; + if (ClMappingScale) { + MappingScale = ClMappingScale; + } + // Redzone used for stack and globals is at least 32 bytes. + // For scales 6 and 7, the redzone has to be 64 and 128 bytes respectively. + RedzoneSize = std::max(32, (int)(1 << MappingScale)); + + bool Res = false; + + if (ClGlobals) + Res |= insertGlobalRedzones(M); + + // Tell the run-time the current values of mapping offset and scale. + GlobalValue *asan_mapping_offset = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, MappingOffset), + kAsanMappingOffsetName); + GlobalValue *asan_mapping_scale = + new GlobalVariable(M, IntptrTy, true, GlobalValue::LinkOnceODRLinkage, + ConstantInt::get(IntptrTy, MappingScale), + kAsanMappingScaleName); + // Read these globals, otherwise they may be optimized away. + IRB.CreateLoad(asan_mapping_scale, true); + IRB.CreateLoad(asan_mapping_offset, true); + + + for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { + if (F->isDeclaration()) continue; + Res |= handleFunction(M, *F); + } + + appendToGlobalCtors(M, AsanCtorFunction, 1 /*high priority*/); + + return Res; +} + +bool AddressSanitizer::handleFunction(Module &M, Function &F) { + if (BL->isIn(F)) return false; + if (&F == AsanCtorFunction) return false; + + if (!ClDebugFunc.empty() && ClDebugFunc != F.getName()) + return false; + // We want to instrument every address only once per basic block + // (unless there are calls between uses). + SmallSet<Value*, 16> TempsToInstrument; + SmallVector<Instruction*, 16> ToInstrument; + + // Fill the set of memory operations to instrument. + for (Function::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) { + TempsToInstrument.clear(); + for (BasicBlock::iterator BI = FI->begin(), BE = FI->end(); + BI != BE; ++BI) { + if ((isa<LoadInst>(BI) && ClInstrumentReads) || + (isa<StoreInst>(BI) && ClInstrumentWrites)) { + Value *Addr = getLDSTOperand(BI); + if (ClOpt && ClOptSameTemp) { + if (!TempsToInstrument.insert(Addr)) + continue; // We've seen this temp in the current BB. + } + } else if (isa<MemIntrinsic>(BI) && ClMemIntrin) { + // ok, take it. + } else { + if (isa<CallInst>(BI)) { + // A call inside BB. + TempsToInstrument.clear(); + } + continue; + } + ToInstrument.push_back(BI); + } + } + + // Instrument. + int NumInstrumented = 0; + for (size_t i = 0, n = ToInstrument.size(); i != n; i++) { + Instruction *Inst = ToInstrument[i]; + if (ClDebugMin < 0 || ClDebugMax < 0 || + (NumInstrumented >= ClDebugMin && NumInstrumented <= ClDebugMax)) { + if (isa<StoreInst>(Inst) || isa<LoadInst>(Inst)) + instrumentMop(Inst); + else + instrumentMemIntrinsic(cast<MemIntrinsic>(Inst)); + } + NumInstrumented++; + } + + DEBUG(dbgs() << F); + + bool ChangedStack = poisonStackInFunction(M, F); + + // For each NSObject descendant having a +load method, this method is invoked + // by the ObjC runtime before any of the static constructors is called. + // Therefore we need to instrument such methods with a call to __asan_init + // at the beginning in order to initialize our runtime before any access to + // the shadow memory. + // We cannot just ignore these methods, because they may call other + // instrumented functions. 
+ if (F.getName().find(" load]") != std::string::npos) { + IRBuilder<> IRB(F.begin()->begin()); + IRB.CreateCall(AsanInitFunction); + } + + return NumInstrumented > 0 || ChangedStack; +} + +static uint64_t ValueForPoison(uint64_t PoisonByte, size_t ShadowRedzoneSize) { + if (ShadowRedzoneSize == 1) return PoisonByte; + if (ShadowRedzoneSize == 2) return (PoisonByte << 8) + PoisonByte; + if (ShadowRedzoneSize == 4) + return (PoisonByte << 24) + (PoisonByte << 16) + + (PoisonByte << 8) + (PoisonByte); + assert(0 && "ShadowRedzoneSize is either 1, 2 or 4"); + return 0; +} + +static void PoisonShadowPartialRightRedzone(uint8_t *Shadow, + size_t Size, + size_t RedzoneSize, + size_t ShadowGranularity, + uint8_t Magic) { + for (size_t i = 0; i < RedzoneSize; + i+= ShadowGranularity, Shadow++) { + if (i + ShadowGranularity <= Size) { + *Shadow = 0; // fully addressable + } else if (i >= Size) { + *Shadow = Magic; // unaddressable + } else { + *Shadow = Size - i; // first Size-i bytes are addressable + } + } +} + +void AddressSanitizer::PoisonStack(const ArrayRef<AllocaInst*> &AllocaVec, + IRBuilder<> IRB, + Value *ShadowBase, bool DoPoison) { + size_t ShadowRZSize = RedzoneSize >> MappingScale; + assert(ShadowRZSize >= 1 && ShadowRZSize <= 4); + Type *RZTy = Type::getIntNTy(*C, ShadowRZSize * 8); + Type *RZPtrTy = PointerType::get(RZTy, 0); + + Value *PoisonLeft = ConstantInt::get(RZTy, + ValueForPoison(DoPoison ? kAsanStackLeftRedzoneMagic : 0LL, ShadowRZSize)); + Value *PoisonMid = ConstantInt::get(RZTy, + ValueForPoison(DoPoison ? kAsanStackMidRedzoneMagic : 0LL, ShadowRZSize)); + Value *PoisonRight = ConstantInt::get(RZTy, + ValueForPoison(DoPoison ? kAsanStackRightRedzoneMagic : 0LL, ShadowRZSize)); + + // poison the first red zone. + IRB.CreateStore(PoisonLeft, IRB.CreateIntToPtr(ShadowBase, RZPtrTy)); + + // poison all other red zones. + uint64_t Pos = RedzoneSize; + for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { + AllocaInst *AI = AllocaVec[i]; + uint64_t SizeInBytes = getAllocaSizeInBytes(AI); + uint64_t AlignedSize = getAlignedAllocaSize(AI); + assert(AlignedSize - SizeInBytes < RedzoneSize); + Value *Ptr = NULL; + + Pos += AlignedSize; + + assert(ShadowBase->getType() == IntptrTy); + if (SizeInBytes < AlignedSize) { + // Poison the partial redzone at right + Ptr = IRB.CreateAdd( + ShadowBase, ConstantInt::get(IntptrTy, + (Pos >> MappingScale) - ShadowRZSize)); + size_t AddressableBytes = RedzoneSize - (AlignedSize - SizeInBytes); + uint32_t Poison = 0; + if (DoPoison) { + PoisonShadowPartialRightRedzone((uint8_t*)&Poison, AddressableBytes, + RedzoneSize, + 1ULL << MappingScale, + kAsanStackPartialRedzoneMagic); + } + Value *PartialPoison = ConstantInt::get(RZTy, Poison); + IRB.CreateStore(PartialPoison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); + } + + // Poison the full redzone at right. + Ptr = IRB.CreateAdd(ShadowBase, + ConstantInt::get(IntptrTy, Pos >> MappingScale)); + Value *Poison = i == AllocaVec.size() - 1 ? PoisonRight : PoisonMid; + IRB.CreateStore(Poison, IRB.CreateIntToPtr(Ptr, RZPtrTy)); + + Pos += RedzoneSize; + } +} + +// Workaround for bug 11395: we don't want to instrument stack in functions +// with large assembly blobs (32-bit only), otherwise reg alloc may crash. +// FIXME: remove once the bug 11395 is fixed. 
+bool AddressSanitizer::LooksLikeCodeInBug11395(Instruction *I) { + if (LongSize != 32) return false; + CallInst *CI = dyn_cast<CallInst>(I); + if (!CI || !CI->isInlineAsm()) return false; + if (CI->getNumArgOperands() <= 5) return false; + // We have inline assembly with quite a few arguments. + return true; +} + +// Find all static Alloca instructions and put +// poisoned red zones around all of them. +// Then unpoison everything back before the function returns. +// +// Stack poisoning does not play well with exception handling. +// When an exception is thrown, we essentially bypass the code +// that unpoisons the stack. This is why the run-time library has +// to intercept __cxa_throw (as well as longjmp, etc) and unpoison the entire +// stack in the interceptor. This however does not work inside the +// actual function which catches the exception. Most likely because the +// compiler hoists the load of the shadow value somewhere too high. +// This causes asan to report a non-existing bug on 453.povray. +// It sounds like an LLVM bug. +bool AddressSanitizer::poisonStackInFunction(Module &M, Function &F) { + if (!ClStack) return false; + SmallVector<AllocaInst*, 16> AllocaVec; + SmallVector<Instruction*, 8> RetVec; + uint64_t TotalSize = 0; + + // Filter out Alloca instructions we want (and can) handle. + // Collect Ret instructions. + for (Function::iterator FI = F.begin(), FE = F.end(); + FI != FE; ++FI) { + BasicBlock &BB = *FI; + for (BasicBlock::iterator BI = BB.begin(), BE = BB.end(); + BI != BE; ++BI) { + if (LooksLikeCodeInBug11395(BI)) return false; + if (isa<ReturnInst>(BI)) { + RetVec.push_back(BI); + continue; + } + + AllocaInst *AI = dyn_cast<AllocaInst>(BI); + if (!AI) continue; + if (AI->isArrayAllocation()) continue; + if (!AI->isStaticAlloca()) continue; + if (!AI->getAllocatedType()->isSized()) continue; + if (AI->getAlignment() > RedzoneSize) continue; + AllocaVec.push_back(AI); + uint64_t AlignedSize = getAlignedAllocaSize(AI); + TotalSize += AlignedSize; + } + } + + if (AllocaVec.empty()) return false; + + uint64_t LocalStackSize = TotalSize + (AllocaVec.size() + 1) * RedzoneSize; + + bool DoStackMalloc = ClUseAfterReturn + && LocalStackSize <= kMaxStackMallocSize; + + Instruction *InsBefore = AllocaVec[0]; + IRBuilder<> IRB(InsBefore); + + + Type *ByteArrayTy = ArrayType::get(IRB.getInt8Ty(), LocalStackSize); + AllocaInst *MyAlloca = + new AllocaInst(ByteArrayTy, "MyAlloca", InsBefore); + MyAlloca->setAlignment(RedzoneSize); + assert(MyAlloca->isStaticAlloca()); + Value *OrigStackBase = IRB.CreatePointerCast(MyAlloca, IntptrTy); + Value *LocalStackBase = OrigStackBase; + + if (DoStackMalloc) { + Value *AsanStackMallocFunc = M.getOrInsertFunction( + kAsanStackMallocName, IntptrTy, IntptrTy, IntptrTy, NULL); + LocalStackBase = IRB.CreateCall2(AsanStackMallocFunc, + ConstantInt::get(IntptrTy, LocalStackSize), OrigStackBase); + } + + // This string will be parsed by the run-time (DescribeStackAddress). + SmallString<2048> StackDescriptionStorage; + raw_svector_ostream StackDescription(StackDescriptionStorage); + StackDescription << F.getName() << " " << AllocaVec.size() << " "; + + uint64_t Pos = RedzoneSize; + // Replace Alloca instructions with base+offset.
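The offset bookkeeping in the loop that follows can be checked with a standalone sketch (made-up sizes, my illustration, not code from the patch): two allocas whose aligned sizes are 32 bytes each, with the default 32-byte redzone, give the frame layout [left RZ][var0][RZ][var1][right RZ].

#include <cassert>
#include <cstdint>

int main() {
  const uint64_t RedzoneSize    = 32;         // default: max(32, 1 << scale)
  const uint64_t AlignedSize[2] = {32, 32};   // two 8-byte allocas, rounded up
  const uint64_t TotalSize      = AlignedSize[0] + AlignedSize[1];
  const uint64_t LocalStackSize = TotalSize + (2 + 1) * RedzoneSize;  // 160

  uint64_t Pos = RedzoneSize;                 // skip the left redzone
  uint64_t Offset[2];
  for (int i = 0; i < 2; i++) {
    Offset[i] = Pos;                          // alloca i becomes base + Pos
    Pos += AlignedSize[i] + RedzoneSize;      // the variable, then its redzone
  }
  assert(Pos == LocalStackSize);              // mirrors the assert in the pass
  assert(Offset[0] == 32 && Offset[1] == 96);
  return 0;
}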
+ for (size_t i = 0, n = AllocaVec.size(); i < n; i++) { + AllocaInst *AI = AllocaVec[i]; + uint64_t SizeInBytes = getAllocaSizeInBytes(AI); + StringRef Name = AI->getName(); + StackDescription << Pos << " " << SizeInBytes << " " + << Name.size() << " " << Name << " "; + uint64_t AlignedSize = getAlignedAllocaSize(AI); + assert((AlignedSize % RedzoneSize) == 0); + AI->replaceAllUsesWith( + IRB.CreateIntToPtr( + IRB.CreateAdd(LocalStackBase, ConstantInt::get(IntptrTy, Pos)), + AI->getType())); + Pos += AlignedSize + RedzoneSize; + } + assert(Pos == LocalStackSize); + + // Write the Magic value and the frame description constant to the redzone. + Value *BasePlus0 = IRB.CreateIntToPtr(LocalStackBase, IntptrPtrTy); + IRB.CreateStore(ConstantInt::get(IntptrTy, kCurrentStackFrameMagic), + BasePlus0); + Value *BasePlus1 = IRB.CreateAdd(LocalStackBase, + ConstantInt::get(IntptrTy, LongSize/8)); + BasePlus1 = IRB.CreateIntToPtr(BasePlus1, IntptrPtrTy); + Value *Description = IRB.CreatePointerCast( + createPrivateGlobalForString(M, StackDescription.str()), + IntptrTy); + IRB.CreateStore(Description, BasePlus1); + + // Poison the stack redzones at the entry. + Value *ShadowBase = memToShadow(LocalStackBase, IRB); + PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRB, ShadowBase, true); + + Value *AsanStackFreeFunc = NULL; + if (DoStackMalloc) { + AsanStackFreeFunc = M.getOrInsertFunction( + kAsanStackFreeName, IRB.getVoidTy(), + IntptrTy, IntptrTy, IntptrTy, NULL); + } + + // Unpoison the stack before all ret instructions. + for (size_t i = 0, n = RetVec.size(); i < n; i++) { + Instruction *Ret = RetVec[i]; + IRBuilder<> IRBRet(Ret); + + // Mark the current frame as retired. + IRBRet.CreateStore(ConstantInt::get(IntptrTy, kRetiredStackFrameMagic), + BasePlus0); + // Unpoison the stack. 
+ PoisonStack(ArrayRef<AllocaInst*>(AllocaVec), IRBRet, ShadowBase, false); + + if (DoStackMalloc) { + IRBRet.CreateCall3(AsanStackFreeFunc, LocalStackBase, + ConstantInt::get(IntptrTy, LocalStackSize), + OrigStackBase); + } + } + + if (ClDebugStack) { + DEBUG(dbgs() << F); + } + + return true; +} + +BlackList::BlackList(const std::string &Path) { + Functions = NULL; + const char *kFunPrefix = "fun:"; + if (!ClBlackListFile.size()) return; + std::string Fun; + + OwningPtr<MemoryBuffer> File; + if (error_code EC = MemoryBuffer::getFile(ClBlackListFile.c_str(), File)) { + errs() << EC.message(); + exit(1); + } + MemoryBuffer *Buff = File.take(); + const char *Data = Buff->getBufferStart(); + size_t DataLen = Buff->getBufferSize(); + SmallVector<StringRef, 16> Lines; + SplitString(StringRef(Data, DataLen), Lines, "\n\r"); + for (size_t i = 0, numLines = Lines.size(); i < numLines; i++) { + if (Lines[i].startswith(kFunPrefix)) { + std::string ThisFunc = Lines[i].substr(strlen(kFunPrefix)); + if (Fun.size()) { + Fun += "|"; + } + // add ThisFunc replacing * with .* + for (size_t j = 0, n = ThisFunc.size(); j < n; j++) { + if (ThisFunc[j] == '*') + Fun += '.'; + Fun += ThisFunc[j]; + } + } + } + if (Fun.size()) { + Functions = new Regex(Fun); + } +} + +bool BlackList::isIn(const Function &F) { + if (Functions) { + bool Res = Functions->match(F.getName()); + return Res; + } + return false; +} diff --git a/lib/Transforms/Instrumentation/CMakeLists.txt b/lib/Transforms/Instrumentation/CMakeLists.txt index 7b3a927..929b7cd 100644 --- a/lib/Transforms/Instrumentation/CMakeLists.txt +++ b/lib/Transforms/Instrumentation/CMakeLists.txt @@ -1,4 +1,5 @@ add_llvm_library(LLVMInstrumentation + AddressSanitizer.cpp EdgeProfiling.cpp GCOVProfiling.cpp Instrumentation.cpp diff --git a/lib/Transforms/Instrumentation/Instrumentation.cpp b/lib/Transforms/Instrumentation/Instrumentation.cpp index 71adc1e..6d6e0ae 100644 --- a/lib/Transforms/Instrumentation/Instrumentation.cpp +++ b/lib/Transforms/Instrumentation/Instrumentation.cpp @@ -24,6 +24,7 @@ void llvm::initializeInstrumentation(PassRegistry &Registry) { initializeOptimalEdgeProfilerPass(Registry); initializePathProfilerPass(Registry); initializeGCOVProfilerPass(Registry); + initializeAddressSanitizerPass(Registry); } /// LLVMInitializeInstrumentation - C binding for diff --git a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp index 62c21b8..1fe1254 100644 --- a/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp +++ b/lib/Transforms/Instrumentation/OptimalEdgeProfiling.cpp @@ -69,7 +69,7 @@ inline static void printEdgeCounter(ProfileInfo::Edge e, BasicBlock* b, unsigned i) { DEBUG(dbgs() << "--Edge Counter for " << (e) << " in " \ - << ((b)?(b)->getNameStr():"0") << " (# " << (i) << ")\n"); + << ((b)?(b)->getName():"0") << " (# " << (i) << ")\n"); } bool OptimalEdgeProfiler::runOnModule(Module &M) { @@ -127,7 +127,7 @@ bool OptimalEdgeProfiler::runOnModule(Module &M) { unsigned i = 0; for (Module::iterator F = M.begin(), E = M.end(); F != E; ++F) { if (F->isDeclaration()) continue; - DEBUG(dbgs() << "Working on " << F->getNameStr() << "\n"); + DEBUG(dbgs() << "Working on " << F->getName() << "\n"); // Calculate a Maximum Spanning Tree with the edge weights determined by // ProfileEstimator. 
ProfileEstimator also assign weights to the virtual diff --git a/lib/Transforms/Instrumentation/PathProfiling.cpp b/lib/Transforms/Instrumentation/PathProfiling.cpp index 23915d3..b214796 100644 --- a/lib/Transforms/Instrumentation/PathProfiling.cpp +++ b/lib/Transforms/Instrumentation/PathProfiling.cpp @@ -665,7 +665,7 @@ void BLInstrumentationDag::unlinkPhony() { // Generate a .dot graph to represent the DAG and pathNumbers void BLInstrumentationDag::generateDotGraph() { std::string errorInfo; - std::string functionName = getFunction().getNameStr(); + std::string functionName = getFunction().getName().str(); std::string filename = "pathdag." + functionName + ".dot"; DEBUG (dbgs() << "Writing '" << filename << "'...\n"); @@ -750,7 +750,8 @@ Value* BLInstrumentationNode::getStartingPathNumber(){ // Sets the Value of the pathNumber. Used by the instrumentation code. void BLInstrumentationNode::setStartingPathNumber(Value* pathNumber) { DEBUG(dbgs() << " SPN-" << getName() << " <-- " << (pathNumber ? - pathNumber->getNameStr() : "unused") << "\n"); + pathNumber->getName() : + "unused") << "\n"); _startingPathNumber = pathNumber; } @@ -760,7 +761,7 @@ Value* BLInstrumentationNode::getEndingPathNumber(){ void BLInstrumentationNode::setEndingPathNumber(Value* pathNumber) { DEBUG(dbgs() << " EPN-" << getName() << " <-- " - << (pathNumber ? pathNumber->getNameStr() : "unused") << "\n"); + << (pathNumber ? pathNumber->getName() : "unused") << "\n"); _endingPathNumber = pathNumber; } @@ -1239,9 +1240,9 @@ void PathProfiler::insertInstrumentation( insertPoint++; DEBUG(dbgs() << "\nInstrumenting method call block '" - << node->getBlock()->getNameStr() << "'\n"); + << node->getBlock()->getName() << "'\n"); DEBUG(dbgs() << " Path number initialized: " - << ((node->getStartingPathNumber()) ? "yes" : "no") << "\n"); + << ((node->getStartingPathNumber()) ? 
"yes" : "no") << "\n"); Value* newpn; if( node->getStartingPathNumber() ) { @@ -1370,7 +1371,7 @@ bool PathProfiler::runOnModule(Module &M) { if (F->isDeclaration()) continue; - DEBUG(dbgs() << "Function: " << F->getNameStr() << "\n"); + DEBUG(dbgs() << "Function: " << F->getName() << "\n"); functionNumber++; // set function number diff --git a/lib/Transforms/Scalar/DeadStoreElimination.cpp b/lib/Transforms/Scalar/DeadStoreElimination.cpp index 8f5f157..f5688cb 100644 --- a/lib/Transforms/Scalar/DeadStoreElimination.cpp +++ b/lib/Transforms/Scalar/DeadStoreElimination.cpp @@ -274,43 +274,35 @@ static Value *getStoredPointerOperand(Instruction *I) { } } -static uint64_t getPointerSize(Value *V, AliasAnalysis &AA) { +static uint64_t getPointerSize(const Value *V, AliasAnalysis &AA) { const TargetData *TD = AA.getTargetData(); - if (CallInst *CI = dyn_cast<CallInst>(V)) { - assert(isMalloc(CI) && "Expected Malloc call!"); - if (ConstantInt *C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) + if (const CallInst *CI = extractMallocCall(V)) { + if (const ConstantInt *C = dyn_cast<ConstantInt>(CI->getArgOperand(0))) return C->getZExtValue(); - return AliasAnalysis::UnknownSize; } if (TD == 0) return AliasAnalysis::UnknownSize; - if (AllocaInst *A = dyn_cast<AllocaInst>(V)) { + if (const AllocaInst *A = dyn_cast<AllocaInst>(V)) { // Get size information for the alloca - if (ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) + if (const ConstantInt *C = dyn_cast<ConstantInt>(A->getArraySize())) return C->getZExtValue() * TD->getTypeAllocSize(A->getAllocatedType()); - return AliasAnalysis::UnknownSize; } - assert(isa<Argument>(V) && "Expected AllocaInst, malloc call or Argument!"); - PointerType *PT = cast<PointerType>(V->getType()); - return TD->getTypeAllocSize(PT->getElementType()); -} + if (const Argument *A = dyn_cast<Argument>(V)) { + if (A->hasByValAttr()) + if (PointerType *PT = dyn_cast<PointerType>(A->getType())) + return TD->getTypeAllocSize(PT->getElementType()); + } -/// isObjectPointerWithTrustworthySize - Return true if the specified Value* is -/// pointing to an object with a pointer size we can trust. -static bool isObjectPointerWithTrustworthySize(const Value *V) { - if (const AllocaInst *AI = dyn_cast<AllocaInst>(V)) - return !AI->isArrayAllocation(); - if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) - return !GV->mayBeOverridden(); - if (const Argument *A = dyn_cast<Argument>(V)) - return A->hasByValAttr(); - if (isMalloc(V)) - return true; - return false; + if (const GlobalVariable *GV = dyn_cast<GlobalVariable>(V)) { + if (!GV->mayBeOverridden()) + return TD->getTypeAllocSize(GV->getType()->getElementType()); + } + + return AliasAnalysis::UnknownSize; } namespace { @@ -329,8 +321,8 @@ namespace { static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, const AliasAnalysis::Location &Earlier, AliasAnalysis &AA, - int64_t& EarlierOff, - int64_t& LaterOff) { + int64_t &EarlierOff, + int64_t &LaterOff) { const Value *P1 = Earlier.Ptr->stripPointerCasts(); const Value *P2 = Later.Ptr->stripPointerCasts(); @@ -377,12 +369,10 @@ static OverwriteResult isOverwrite(const AliasAnalysis::Location &Later, return OverwriteUnknown; // If the "Later" store is to a recognizable object, get its size. 
- if (isObjectPointerWithTrustworthySize(UO2)) { - uint64_t ObjectSize = - TD.getTypeAllocSize(cast<PointerType>(UO2->getType())->getElementType()); + uint64_t ObjectSize = getPointerSize(UO2, AA); + if (ObjectSize != AliasAnalysis::UnknownSize) if (ObjectSize == Later.Size && ObjectSize >= Earlier.Size) return OverwriteComplete; - } // Okay, we have stores to two completely different pointers. Try to // decompose the pointer into a "base + constant_offset" form. If the base diff --git a/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/lib/Transforms/Scalar/LoopStrengthReduce.cpp index 3e122c2..4ae51d5 100644 --- a/lib/Transforms/Scalar/LoopStrengthReduce.cpp +++ b/lib/Transforms/Scalar/LoopStrengthReduce.cpp @@ -3827,8 +3827,8 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // Remove any extra phis created by processing inner loops. SmallVector<WeakVH, 16> DeadInsts; SCEVExpander Rewriter(SE, "lsr"); - Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); - Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + Changed |= (bool)Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= (bool)DeleteTriviallyDeadInstructions(DeadInsts); } DEBUG(dbgs() << "LSR skipping outer loop " << *L << "\n"); return; @@ -3880,8 +3880,8 @@ LSRInstance::LSRInstance(const TargetLowering *tli, Loop *l, Pass *P) // Remove any extra phis created by processing inner loops. SmallVector<WeakVH, 16> DeadInsts; SCEVExpander Rewriter(SE, "lsr"); - Changed |= Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); - Changed |= DeleteTriviallyDeadInstructions(DeadInsts); + Changed |= (bool)Rewriter.replaceCongruentIVs(L, &DT, DeadInsts); + Changed |= (bool)DeleteTriviallyDeadInstructions(DeadInsts); } } diff --git a/lib/Transforms/Scalar/MemCpyOptimizer.cpp b/lib/Transforms/Scalar/MemCpyOptimizer.cpp index 298d692..9e4f51f 100644 --- a/lib/Transforms/Scalar/MemCpyOptimizer.cpp +++ b/lib/Transforms/Scalar/MemCpyOptimizer.cpp @@ -950,7 +950,7 @@ bool MemCpyOpt::iterateOnFunction(Function &F) { RepeatInstruction = processMemMove(M); else if (CallSite CS = (Value*)I) { for (unsigned i = 0, e = CS.arg_size(); i != e; ++i) - if (CS.paramHasAttr(i+1, Attribute::ByVal)) + if (CS.isByValArgument(i)) MadeChange |= processByValArgument(CS, i); } diff --git a/lib/Transforms/Scalar/ScalarReplAggregates.cpp b/lib/Transforms/Scalar/ScalarReplAggregates.cpp index c12f403..4b14efc 100644 --- a/lib/Transforms/Scalar/ScalarReplAggregates.cpp +++ b/lib/Transforms/Scalar/ScalarReplAggregates.cpp @@ -2534,13 +2534,12 @@ isOnlyCopiedFromConstantGlobal(Value *V, MemTransferInst *&TheCopy, // ignore it if we know that the value isn't captured. unsigned ArgNo = CS.getArgumentNo(UI); if (CS.onlyReadsMemory() && - (CS.getInstruction()->use_empty() || - CS.paramHasAttr(ArgNo+1, Attribute::NoCapture))) + (CS.getInstruction()->use_empty() || CS.doesNotCapture(ArgNo))) continue; // If this is being passed as a byval argument, the caller is making a // copy, so it is only a read of the alloca. 
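The byval point above has a direct C++ analogue (my illustration, not part of the patch): a parameter passed by value is a caller-side copy, so writes in the callee never reach the caller's object, just as an LLVM byval argument is only a read of the underlying alloca.

#include <cassert>

struct S { int x; };

static void callee(S s) { s.x = 42; }  // mutates only the copy

int main() {
  S local = {1};
  callee(local);         // the caller makes the copy, as with a byval arg
  assert(local.x == 1);  // the original object was only read
  return 0;
}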
- if (CS.paramHasAttr(ArgNo+1, Attribute::ByVal)) + if (CS.isByValArgument(ArgNo)) continue; } diff --git a/lib/Transforms/Scalar/SimplifyLibCalls.cpp b/lib/Transforms/Scalar/SimplifyLibCalls.cpp index 2a00ae1..6e169de 100644 --- a/lib/Transforms/Scalar/SimplifyLibCalls.cpp +++ b/lib/Transforms/Scalar/SimplifyLibCalls.cpp @@ -963,8 +963,7 @@ struct UnaryDoubleFPOpt : public LibCallOptimization { // floor((double)floatval) -> (double)floorf(floatval) Value *V = Cast->getOperand(0); - V = EmitUnaryFloatFnCall(V, Callee->getName().data(), B, - Callee->getAttributes()); + V = EmitUnaryFloatFnCall(V, Callee->getName(), B, Callee->getAttributes()); return B.CreateFPExt(V, B.getDoubleTy()); } }; @@ -1324,7 +1323,7 @@ struct FPutsOpt : public LibCallOptimization { if (!Len) return 0; EmitFWrite(CI->getArgOperand(0), ConstantInt::get(TD->getIntPtrType(*Context), Len-1), - CI->getArgOperand(1), B, TD); + CI->getArgOperand(1), B, TD, TLI); return CI; // Known to have no uses (see above). } }; @@ -1352,7 +1351,7 @@ struct FPrintFOpt : public LibCallOptimization { EmitFWrite(CI->getArgOperand(1), ConstantInt::get(TD->getIntPtrType(*Context), FormatStr.size()), - CI->getArgOperand(0), B, TD); + CI->getArgOperand(0), B, TD, TLI); return ConstantInt::get(CI->getType(), FormatStr.size()); } @@ -1374,7 +1373,7 @@ struct FPrintFOpt : public LibCallOptimization { // fprintf(F, "%s", str) --> fputs(str, F) if (!CI->getArgOperand(2)->getType()->isPointerTy() || !CI->use_empty()) return 0; - EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD); + EmitFPutS(CI->getArgOperand(2), CI->getArgOperand(0), B, TD, TLI); return CI; } return 0; @@ -1470,6 +1469,7 @@ namespace { SimplifyLibCalls() : FunctionPass(ID), StrCpy(false), StrCpyChk(true) { initializeSimplifyLibCallsPass(*PassRegistry::getPassRegistry()); } + void AddOpt(LibFunc::Func F, LibCallOptimization* Opt); void InitOptimizations(); bool runOnFunction(Function &F); @@ -1500,6 +1500,11 @@ FunctionPass *llvm::createSimplifyLibCallsPass() { return new SimplifyLibCalls(); } +void SimplifyLibCalls::AddOpt(LibFunc::Func F, LibCallOptimization* Opt) { + if (TLI->has(F)) + Optimizations[TLI->getName(F)] = Opt; +} + /// Optimizations - Populate the Optimizations map with all the optimizations /// we know. void SimplifyLibCalls::InitOptimizations() { @@ -1525,9 +1530,9 @@ void SimplifyLibCalls::InitOptimizations() { Optimizations["strcspn"] = &StrCSpn; Optimizations["strstr"] = &StrStr; Optimizations["memcmp"] = &MemCmp; - if (TLI->has(LibFunc::memcpy)) Optimizations["memcpy"] = &MemCpy; + AddOpt(LibFunc::memcpy, &MemCpy); Optimizations["memmove"] = &MemMove; - if (TLI->has(LibFunc::memset)) Optimizations["memset"] = &MemSet; + AddOpt(LibFunc::memset, &MemSet); // _chk variants of String and Memory LibCall Optimizations. 
Optimizations["__strcpy_chk"] = &StrCpyChk; @@ -1580,8 +1585,8 @@ void SimplifyLibCalls::InitOptimizations() { // Formatting and IO Optimizations Optimizations["sprintf"] = &SPrintF; Optimizations["printf"] = &PrintF; - Optimizations["fwrite"] = &FWrite; - Optimizations["fputs"] = &FPuts; + AddOpt(LibFunc::fwrite, &FWrite); + AddOpt(LibFunc::fputs, &FPuts); Optimizations["fprintf"] = &FPrintF; Optimizations["puts"] = &Puts; } diff --git a/lib/Transforms/Utils/BuildLibCalls.cpp b/lib/Transforms/Utils/BuildLibCalls.cpp index 4b5f45b..a808303 100644 --- a/lib/Transforms/Utils/BuildLibCalls.cpp +++ b/lib/Transforms/Utils/BuildLibCalls.cpp @@ -15,11 +15,15 @@ #include "llvm/Type.h" #include "llvm/Constants.h" #include "llvm/Function.h" +#include "llvm/Intrinsics.h" +#include "llvm/LLVMContext.h" #include "llvm/Module.h" #include "llvm/Support/IRBuilder.h" #include "llvm/Target/TargetData.h" +#include "llvm/Target/TargetLibraryInfo.h" #include "llvm/LLVMContext.h" #include "llvm/Intrinsics.h" +#include "llvm/ADT/SmallString.h" using namespace llvm; @@ -206,19 +210,16 @@ Value *llvm::EmitMemCmp(Value *Ptr1, Value *Ptr2, /// 'floor'). This function is known to take a single of type matching 'Op' and /// returns one value with the same type. If 'Op' is a long double, 'l' is /// added as the suffix of name, if 'Op' is a float, we add a 'f' suffix. -Value *llvm::EmitUnaryFloatFnCall(Value *Op, const char *Name, - IRBuilder<> &B, const AttrListPtr &Attrs) { - char NameBuffer[20]; +Value *llvm::EmitUnaryFloatFnCall(Value *Op, StringRef Name, IRBuilder<> &B, + const AttrListPtr &Attrs) { + SmallString<20> NameBuffer; if (!Op->getType()->isDoubleTy()) { // If we need to add a suffix, copy into NameBuffer. - unsigned NameLen = strlen(Name); - assert(NameLen < sizeof(NameBuffer)-2); - memcpy(NameBuffer, Name, NameLen); + NameBuffer += Name; if (Op->getType()->isFloatTy()) - NameBuffer[NameLen] = 'f'; // floorf + NameBuffer += 'f'; // floorf else - NameBuffer[NameLen] = 'l'; // floorl - NameBuffer[NameLen+1] = 0; + NameBuffer += 'l'; // floorl Name = NameBuffer; } @@ -299,20 +300,21 @@ void llvm::EmitFPutC(Value *Char, Value *File, IRBuilder<> &B, /// EmitFPutS - Emit a call to the puts function. Str is required to be a /// pointer and File is a pointer to FILE. void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, - const TargetData *TD) { + const TargetData *TD, const TargetLibraryInfo *TLI) { Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(2, Attribute::NoCapture); AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); + StringRef FPutsName = TLI->getName(LibFunc::fputs); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fputs", AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FPutsName, AttrListPtr::get(AWI, 3), B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); else - F = M->getOrInsertFunction("fputs", B.getInt32Ty(), + F = M->getOrInsertFunction(FPutsName, B.getInt32Ty(), B.getInt8PtrTy(), File->getType(), NULL); CallInst *CI = B.CreateCall2(F, CastToCStr(Str, B), File, "fputs"); @@ -324,23 +326,25 @@ void llvm::EmitFPutS(Value *Str, Value *File, IRBuilder<> &B, /// EmitFWrite - Emit a call to the fwrite function. This assumes that Ptr is /// a pointer, Size is an 'intptr_t', and File is a pointer to FILE. 
void llvm::EmitFWrite(Value *Ptr, Value *Size, Value *File, - IRBuilder<> &B, const TargetData *TD) { + IRBuilder<> &B, const TargetData *TD, + const TargetLibraryInfo *TLI) { Module *M = B.GetInsertBlock()->getParent()->getParent(); AttributeWithIndex AWI[3]; AWI[0] = AttributeWithIndex::get(1, Attribute::NoCapture); AWI[1] = AttributeWithIndex::get(4, Attribute::NoCapture); AWI[2] = AttributeWithIndex::get(~0u, Attribute::NoUnwind); LLVMContext &Context = B.GetInsertBlock()->getContext(); + StringRef FWriteName = TLI->getName(LibFunc::fwrite); Constant *F; if (File->getType()->isPointerTy()) - F = M->getOrInsertFunction("fwrite", AttrListPtr::get(AWI, 3), + F = M->getOrInsertFunction(FWriteName, AttrListPtr::get(AWI, 3), TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), TD->getIntPtrType(Context), File->getType(), NULL); else - F = M->getOrInsertFunction("fwrite", TD->getIntPtrType(Context), + F = M->getOrInsertFunction(FWriteName, TD->getIntPtrType(Context), B.getInt8PtrTy(), TD->getIntPtrType(Context), TD->getIntPtrType(Context), diff --git a/lib/Transforms/Utils/InlineFunction.cpp b/lib/Transforms/Utils/InlineFunction.cpp index 5464dbc..dd4a659 100644 --- a/lib/Transforms/Utils/InlineFunction.cpp +++ b/lib/Transforms/Utils/InlineFunction.cpp @@ -987,7 +987,7 @@ bool llvm::InlineFunction(CallSite CS, InlineFunctionInfo &IFI) { // by them explicit. However, we don't do this if the callee is readonly // or readnone, because the copy would be unneeded: the callee doesn't // modify the struct. - if (CalledFunc->paramHasAttr(ArgNo+1, Attribute::ByVal)) { + if (CS.isByValArgument(ArgNo)) { ActualArg = HandleByValArgument(ActualArg, TheCall, CalledFunc, IFI, CalledFunc->getParamAlignment(ArgNo+1)); diff --git a/lib/Transforms/Utils/ModuleUtils.cpp b/lib/Transforms/Utils/ModuleUtils.cpp index db81de7..5e294a3 100644 --- a/lib/Transforms/Utils/ModuleUtils.cpp +++ b/lib/Transforms/Utils/ModuleUtils.cpp @@ -16,6 +16,7 @@ #include "llvm/Function.h" #include "llvm/Module.h" #include "llvm/Support/IRBuilder.h" + using namespace llvm; void llvm::appendToGlobalCtors(Module &M, Function *F, int Priority) { diff --git a/lib/Transforms/Utils/SimplifyIndVar.cpp b/lib/Transforms/Utils/SimplifyIndVar.cpp index 76289c0..6732a78 100644 --- a/lib/Transforms/Utils/SimplifyIndVar.cpp +++ b/lib/Transforms/Utils/SimplifyIndVar.cpp @@ -107,8 +107,8 @@ Value *SimplifyIndvar::foldIVUser(Instruction *UseInst, Instruction *IVOperand) // Attempt to fold a binary operator with constant operand. // e.g. ((I + 1) >> 2) => I >> 2 - if (IVOperand->getNumOperands() != 2 || - !isa<ConstantInt>(IVOperand->getOperand(1))) + if (!isa<BinaryOperator>(IVOperand) + || !isa<ConstantInt>(IVOperand->getOperand(1))) return 0; IVSrc = IVOperand->getOperand(0); diff --git a/lib/VMCore/Pass.cpp b/lib/VMCore/Pass.cpp index 9afc540..fca9466 100644 --- a/lib/VMCore/Pass.cpp +++ b/lib/VMCore/Pass.cpp @@ -25,8 +25,6 @@ using namespace llvm; // Pass Implementation // -Pass::Pass(PassKind K, char &pid) : Resolver(0), PassID(&pid), Kind(K) { } - // Force out-of-line virtual method. 
Pass::~Pass() { delete Resolver; @@ -48,7 +46,7 @@ bool Pass::mustPreserveAnalysisID(char &AID) const { return Resolver->getAnalysisIfAvailable(&AID, true) != 0; } -// dumpPassStructure - Implement the -debug-passes=Structure option +// dumpPassStructure - Implement the -debug-pass=Structure option void Pass::dumpPassStructure(unsigned Offset) { dbgs().indent(Offset*2) << getPassName() << "\n"; } diff --git a/lib/VMCore/Value.cpp b/lib/VMCore/Value.cpp index 2fa5f08..291df91 100644 --- a/lib/VMCore/Value.cpp +++ b/lib/VMCore/Value.cpp @@ -66,7 +66,7 @@ Value::~Value() { // a <badref> // if (!use_empty()) { - dbgs() << "While deleting: " << *VTy << " %" << getNameStr() << "\n"; + dbgs() << "While deleting: " << *VTy << " %" << getName() << "\n"; for (use_iterator I = use_begin(), E = use_end(); I != E; ++I) dbgs() << "Use still stuck around after Def is destroyed:" << **I << "\n"; @@ -156,10 +156,6 @@ StringRef Value::getName() const { return Name->getKey(); } -std::string Value::getNameStr() const { - return getName().str(); -} - void Value::setName(const Twine &NewName) { // Fast path for common IRBuilder case of setName("") when there is no name. if (NewName.isTriviallyEmpty() && !hasName()) @@ -554,7 +550,7 @@ void ValueHandleBase::ValueIsDeleted(Value *V) { // All callbacks, weak references, and assertingVHs should be dropped by now. if (V->HasValueHandle) { #ifndef NDEBUG // Only in +Asserts mode... - dbgs() << "While deleting: " << *V->getType() << " %" << V->getNameStr() + dbgs() << "While deleting: " << *V->getType() << " %" << V->getName() << "\n"; if (pImpl->ValueHandles[V]->getKind() == Assert) llvm_unreachable("An asserting value handle still pointed to this" @@ -617,8 +613,8 @@ void ValueHandleBase::ValueIsRAUWd(Value *Old, Value *New) { case Tracking: case Weak: dbgs() << "After RAUW from " << *Old->getType() << " %" - << Old->getNameStr() << " to " << *New->getType() << " %" - << New->getNameStr() << "\n"; + << Old->getName() << " to " << *New->getType() << " %" + << New->getName() << "\n"; llvm_unreachable("A tracking or weak value handle still pointed to the" " old value!\n"); default: diff --git a/lib/VMCore/ValueTypes.cpp b/lib/VMCore/ValueTypes.cpp index e13bd7d..089c259 100644 --- a/lib/VMCore/ValueTypes.cpp +++ b/lib/VMCore/ValueTypes.cpp @@ -139,7 +139,7 @@ std::string EVT::getEVTString() const { case MVT::v2f64: return "v2f64"; case MVT::v4f64: return "v4f64"; case MVT::Metadata:return "Metadata"; - case MVT::untyped: return "untyped"; + case MVT::Untyped: return "Untyped"; } } diff --git a/test/CodeGen/ARM/fast-isel-GEP-coalesce.ll b/test/CodeGen/ARM/fast-isel-GEP-coalesce.ll new file mode 100644 index 0000000..dbb634d --- /dev/null +++ b/test/CodeGen/ARM/fast-isel-GEP-coalesce.ll @@ -0,0 +1,65 @@ +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=armv7-apple-darwin | FileCheck %s --check-prefix=ARM +; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=dynamic-no-pic -mtriple=thumbv7-apple-darwin | FileCheck %s --check-prefix=THUMB + +%struct.A = type { i32, [2 x [2 x i32]], i8, [3 x [3 x [3 x i32]]] } +%struct.B = type { i32, [2 x [2 x [2 x %struct.A]]] } + +@arr = common global [2 x [2 x [2 x [2 x [2 x i32]]]]] zeroinitializer, align 4 +@A = common global [3 x [3 x %struct.A]] zeroinitializer, align 4 +@B = common global [2 x [2 x [2 x %struct.B]]] zeroinitializer, align 4 + +define i32* @t1() nounwind { +entry: +; ARM: t1 +; THUMB: t1 + %addr = alloca i32*, align 4 + store i32* getelementptr inbounds ([2 x [2 x [2 x [2 x [2 x 
i32]]]]]* @arr, i32 0, i32 1, i32 1, i32 1, i32 1, i32 1), i32** %addr, align 4 +; ARM: add r0, r0, #124 +; THUMB: adds r0, #124 + %0 = load i32** %addr, align 4 + ret i32* %0 +} + +define i32* @t2() nounwind { +entry: +; ARM: t2 +; THUMB: t2 + %addr = alloca i32*, align 4 + store i32* getelementptr inbounds ([3 x [3 x %struct.A]]* @A, i32 0, i32 2, i32 2, i32 3, i32 1, i32 2, i32 2), i32** %addr, align 4 +; ARM: movw r1, #1148 +; ARM: add r0, r0, r1 +; THUMB: addw r0, r0, #1148 + %0 = load i32** %addr, align 4 + ret i32* %0 +} + +define i32* @t3() nounwind { +entry: +; ARM: t3 +; THUMB: t3 + %addr = alloca i32*, align 4 + store i32* getelementptr inbounds ([3 x [3 x %struct.A]]* @A, i32 0, i32 0, i32 1, i32 1, i32 0, i32 1), i32** %addr, align 4 +; ARM: add r0, r0, #140 +; THUMB: adds r0, #140 + %0 = load i32** %addr, align 4 + ret i32* %0 +} + +define i32* @t4() nounwind { +entry: +; ARM: t4 +; THUMB: t4 + %addr = alloca i32*, align 4 + store i32* getelementptr inbounds ([2 x [2 x [2 x %struct.B]]]* @B, i32 0, i32 0, i32 0, i32 1, i32 1, i32 0, i32 0, i32 1, i32 3, i32 1, i32 2, i32 1), i32** %addr, align 4 +; ARM-NOT: movw r{{[0-9]}}, #1060 +; ARM-NOT: add r{{[0-9]}}, r{{[0-9]}}, #4 +; ARM-NOT: add r{{[0-9]}}, r{{[0-9]}}, #132 +; ARM-NOT: add r{{[0-9]}}, r{{[0-9]}}, #24 +; ARM-NOT: add r{{[0-9]}}, r{{[0-9]}}, #36 +; ARM-NOT: add r{{[0-9]}}, r{{[0-9]}}, #24 +; ARM-NOT: add r{{[0-9]}}, r{{[0-9]}}, #4 +; ARM: movw r{{[0-9]}}, #1284 +; THUMB: addw r{{[0-9]}}, r{{[0-9]}}, #1284 + %0 = load i32** %addr, align 4 + ret i32* %0 +} diff --git a/test/CodeGen/ARM/vmov.ll b/test/CodeGen/ARM/vmov.ll index ab56e5b..0396a41 100644 --- a/test/CodeGen/ARM/vmov.ll +++ b/test/CodeGen/ARM/vmov.ll @@ -353,3 +353,21 @@ define void @noTruncStore(<4 x i32>* %a, <4 x i16>* %b) nounwind { store <4 x i16> %tmp2, <4 x i16>* %b, align 8 ret void } + +; Use vmov.f32 to materialize f32 immediate splats +; rdar://10437054 +define void @v_mov_v2f32(<2 x float>* nocapture %p) nounwind { +entry: +;CHECK: v_mov_v2f32: +;CHECK: vmov.f32 d{{.*}}, #-1.600000e+01 + store <2 x float> <float -1.600000e+01, float -1.600000e+01>, <2 x float>* %p, align 4 + ret void +} + +define void @v_mov_v4f32(<4 x float>* nocapture %p) nounwind { +entry: +;CHECK: v_mov_v4f32: +;CHECK: vmov.f32 q{{.*}}, #3.100000e+01 + store <4 x float> <float 3.100000e+01, float 3.100000e+01, float 3.100000e+01, float 3.100000e+01>, <4 x float>* %p, align 4 + ret void +} diff --git a/test/CodeGen/PowerPC/ppc32-vaarg.ll b/test/CodeGen/PowerPC/ppc32-vaarg.ll index 393800b..725c106 100644 --- a/test/CodeGen/PowerPC/ppc32-vaarg.ll +++ b/test/CodeGen/PowerPC/ppc32-vaarg.ll @@ -12,10 +12,9 @@ target triple = "powerpc-unknown-freebsd9.0" define void @ppcvaargtest(%struct.__va_list_tag* %ap) nounwind { entry: %x = va_arg %struct.__va_list_tag* %ap, i64; Get from r5,r6 -; CHECK: lbz 4, 0(3) -; CHECK-NEXT: rlwinm 5, 4, 0, 31, 31 -; CHECK-NEXT: cmplwi 0, 5, 0 -; CHECK-NEXT: addi 5, 4, 1 +; CHECK: addi 5, 4, 1 +; CHECK-NEXT: rlwinm 6, 4, 0, 31, 31 +; CHECK-NEXT: cmplwi 0, 6, 0 ; CHECK-NEXT: stw 3, -4(1) ; CHECK-NEXT: stw 5, -8(1) ; CHECK-NEXT: stw 4, -12(1) @@ -25,138 +24,137 @@ define void @ppcvaargtest(%struct.__va_list_tag* %ap) nounwind { ; CHECK-NEXT: stw 3, -8(1) ; CHECK-NEXT: .LBB0_2: # %entry ; CHECK-NEXT: lwz 3, -8(1) -; CHECK-NEXT: slwi 4, 3, 2 +; CHECK-NEXT: addi 4, 3, 2 ; CHECK-NEXT: lwz 5, -4(1) ; CHECK-NEXT: lwz 6, 4(5) ; CHECK-NEXT: lwz 7, 8(5) -; CHECK-NEXT: add 4, 7, 4 +; CHECK-NEXT: stb 4, 0(5) ; CHECK-NEXT: cmpwi 0, 3, 8 +; CHECK-NEXT: addi 4, 6, 4 +; 
CHECK-NEXT: mr 8, 6 +; CHECK-NEXT: stw 7, -16(1) +; CHECK-NEXT: stw 4, -20(1) +; CHECK-NEXT: stw 3, -24(1) +; CHECK-NEXT: stw 8, -28(1) +; CHECK-NEXT: stw 6, -32(1) ; CHECK-NEXT: mfcr 0 # cr0 -; CHECK-NEXT: stw 0, -16(1) -; CHECK-NEXT: stw 3, -20(1) -; CHECK-NEXT: stw 4, -24(1) -; CHECK-NEXT: stw 6, -28(1) +; CHECK-NEXT: stw 0, -36(1) ; CHECK-NEXT: blt 0, .LBB0_4 ; CHECK-NEXT: # BB#3: # %entry -; CHECK-NEXT: lwz 3, -28(1) -; CHECK-NEXT: stw 3, -24(1) +; CHECK-NEXT: lwz 3, -20(1) +; CHECK-NEXT: stw 3, -28(1) ; CHECK-NEXT: .LBB0_4: # %entry +; CHECK-NEXT: lwz 3, -28(1) +; CHECK-NEXT: lwz 4, -4(1) +; CHECK-NEXT: stw 3, 4(4) + store i64 %x, i64* @var1, align 8 ; CHECK-NEXT: lwz 3, -24(1) -; CHECK-NEXT: lwz 4, -28(1) -; CHECK-NEXT: addi 5, 4, 4 -; CHECK-NEXT: lwz 0, -16(1) +; CHECK-NEXT: slwi 5, 3, 2 +; CHECK-NEXT: lwz 6, -16(1) +; CHECK-NEXT: add 5, 6, 5 +; CHECK-NEXT: lwz 0, -36(1) ; CHECK-NEXT: mtcrf 128, 0 -; CHECK-NEXT: stw 4, -32(1) -; CHECK-NEXT: stw 5, -36(1) -; CHECK-NEXT: stw 3, -40(1) +; CHECK-NEXT: stw 5, -40(1) ; CHECK-NEXT: blt 0, .LBB0_6 ; CHECK-NEXT: # BB#5: # %entry -; CHECK-NEXT: lwz 3, -36(1) -; CHECK-NEXT: stw 3, -32(1) -; CHECK-NEXT: .LBB0_6: # %entry ; CHECK-NEXT: lwz 3, -32(1) -; CHECK-NEXT: lwz 4, -20(1) -; CHECK-NEXT: addi 5, 4, 2 -; CHECK-NEXT: lwz 6, -4(1) -; CHECK-NEXT: stb 5, 0(6) -; CHECK-NEXT: stw 3, 4(6) - store i64 %x, i64* @var1, align 8 +; CHECK-NEXT: stw 3, -40(1) +; CHECK-NEXT: .LBB0_6: # %entry ; CHECK-NEXT: lwz 3, -40(1) -; CHECK-NEXT: lwz 5, 0(3) -; CHECK-NEXT: lwz 7, 4(3) -; CHECK-NEXT: lis 8, var1@ha -; CHECK-NEXT: la 9, var1@l(8) -; CHECK-NEXT: stw 7, 4(9) -; CHECK-NEXT: stw 5, var1@l(8) +; CHECK-NEXT: lwz 4, 0(3) +; CHECK-NEXT: lwz 3, 4(3) +; CHECK-NEXT: lis 5, var1@ha +; CHECK-NEXT: la 6, var1@l(5) +; CHECK-NEXT: stw 3, 4(6) +; CHECK-NEXT: stw 4, var1@l(5) +; CHECK-NEXT: lwz 3, -4(1) %y = va_arg %struct.__va_list_tag* %ap, double; From f1 -; CHECK-NEXT: lbz 5, 1(6) -; CHECK-NEXT: lwz 7, 4(6) -; CHECK-NEXT: lwz 8, 8(6) -; CHECK-NEXT: slwi 9, 5, 3 -; CHECK-NEXT: add 8, 8, 9 -; CHECK-NEXT: cmpwi 0, 5, 8 -; CHECK-NEXT: addi 9, 7, 8 -; CHECK-NEXT: mr 10, 7 -; CHECK-NEXT: stw 9, -44(1) +; CHECK-NEXT: lbz 4, 1(3) +; CHECK-NEXT: lwz 5, 4(3) +; CHECK-NEXT: lwz 6, 8(3) +; CHECK-NEXT: addi 7, 4, 1 +; CHECK-NEXT: stb 7, 1(3) +; CHECK-NEXT: cmpwi 0, 4, 8 +; CHECK-NEXT: addi 7, 5, 8 +; CHECK-NEXT: mr 8, 5 +; CHECK-NEXT: stw 5, -44(1) ; CHECK-NEXT: stw 7, -48(1) +; CHECK-NEXT: stw 4, -52(1) +; CHECK-NEXT: stw 6, -56(1) +; CHECK-NEXT: stw 8, -60(1) ; CHECK-NEXT: mfcr 0 # cr0 -; CHECK-NEXT: stw 0, -52(1) -; CHECK-NEXT: stw 5, -56(1) -; CHECK-NEXT: stw 10, -60(1) -; CHECK-NEXT: stw 8, -64(1) +; CHECK-NEXT: stw 0, -64(1) ; CHECK-NEXT: blt 0, .LBB0_8 ; CHECK-NEXT: # BB#7: # %entry -; CHECK-NEXT: lwz 3, -44(1) +; CHECK-NEXT: lwz 3, -48(1) ; CHECK-NEXT: stw 3, -60(1) ; CHECK-NEXT: .LBB0_8: # %entry ; CHECK-NEXT: lwz 3, -60(1) -; CHECK-NEXT: lwz 4, -64(1) -; CHECK-NEXT: addi 4, 4, 32 -; CHECK-NEXT: lwz 0, -52(1) +; CHECK-NEXT: lwz 4, -4(1) +; CHECK-NEXT: stw 3, 4(4) +; CHECK-NEXT: lwz 3, -52(1) +; CHECK-NEXT: slwi 5, 3, 3 +; CHECK-NEXT: lwz 6, -56(1) +; CHECK-NEXT: add 5, 6, 5 +; CHECK-NEXT: addi 5, 5, 32 +; CHECK-NEXT: lwz 0, -64(1) ; CHECK-NEXT: mtcrf 128, 0 -; CHECK-NEXT: stw 4, -68(1) -; CHECK-NEXT: stw 3, -72(1) +; CHECK-NEXT: stw 5, -68(1) ; CHECK-NEXT: blt 0, .LBB0_10 ; CHECK-NEXT: # BB#9: # %entry -; CHECK-NEXT: lwz 3, -48(1) +; CHECK-NEXT: lwz 3, -44(1) ; CHECK-NEXT: stw 3, -68(1) ; CHECK-NEXT: .LBB0_10: # %entry ; CHECK-NEXT: lwz 3, -68(1) -; CHECK-NEXT: lwz 4, 
-56(1) -; CHECK-NEXT: addi 5, 4, 1 -; CHECK-NEXT: lwz 6, -4(1) -; CHECK-NEXT: stb 5, 1(6) -; CHECK-NEXT: lwz 5, -72(1) -; CHECK-NEXT: stw 5, 4(6) ; CHECK-NEXT: lfd 0, 0(3) store double %y, double* @var2, align 8 ; CHECK-NEXT: lis 3, var2@ha ; CHECK-NEXT: stfd 0, var2@l(3) %z = va_arg %struct.__va_list_tag* %ap, i32; From r7 -; CHECK-NEXT: lbz 3, 0(6) -; CHECK-NEXT: lwz 5, 4(6) -; CHECK-NEXT: lwz 7, 8(6) -; CHECK-NEXT: slwi 8, 3, 2 -; CHECK-NEXT: add 7, 7, 8 -; CHECK-NEXT: cmpwi 0, 3, 8 -; CHECK-NEXT: addi 8, 5, 4 -; CHECK-NEXT: mr 9, 5 -; CHECK-NEXT: stw 3, -76(1) -; CHECK-NEXT: stw 7, -80(1) -; CHECK-NEXT: stw 8, -84(1) -; CHECK-NEXT: stw 5, -88(1) -; CHECK-NEXT: stw 9, -92(1) +; CHECK-NEXT: lwz 3, -4(1) +; CHECK-NEXT: lbz 4, 0(3) +; CHECK-NEXT: lwz 5, 4(3) +; CHECK-NEXT: lwz 6, 8(3) +; CHECK-NEXT: addi 7, 4, 1 +; CHECK-NEXT: stb 7, 0(3) +; CHECK-NEXT: cmpwi 0, 4, 8 +; CHECK-NEXT: addi 7, 5, 4 +; CHECK-NEXT: mr 8, 5 +; CHECK-NEXT: stw 4, -72(1) +; CHECK-NEXT: stw 6, -76(1) ; CHECK-NEXT: mfcr 0 # cr0 -; CHECK-NEXT: stw 0, -96(1) +; CHECK-NEXT: stw 0, -80(1) +; CHECK-NEXT: stw 5, -84(1) +; CHECK-NEXT: stw 8, -88(1) +; CHECK-NEXT: stw 7, -92(1) ; CHECK-NEXT: blt 0, .LBB0_12 ; CHECK-NEXT: # BB#11: # %entry -; CHECK-NEXT: lwz 3, -84(1) -; CHECK-NEXT: stw 3, -92(1) -; CHECK-NEXT: .LBB0_12: # %entry ; CHECK-NEXT: lwz 3, -92(1) -; CHECK-NEXT: lwz 4, -80(1) -; CHECK-NEXT: lwz 0, -96(1) +; CHECK-NEXT: stw 3, -88(1) +; CHECK-NEXT: .LBB0_12: # %entry +; CHECK-NEXT: lwz 3, -88(1) +; CHECK-NEXT: lwz 4, -4(1) +; CHECK-NEXT: stw 3, 4(4) +; CHECK-NEXT: lwz 3, -72(1) +; CHECK-NEXT: slwi 5, 3, 2 +; CHECK-NEXT: lwz 6, -76(1) +; CHECK-NEXT: add 5, 6, 5 +; CHECK-NEXT: lwz 0, -80(1) ; CHECK-NEXT: mtcrf 128, 0 -; CHECK-NEXT: stw 3, -100(1) -; CHECK-NEXT: stw 4, -104(1) +; CHECK-NEXT: stw 5, -96(1) ; CHECK-NEXT: blt 0, .LBB0_14 ; CHECK-NEXT: # BB#13: # %entry -; CHECK-NEXT: lwz 3, -88(1) -; CHECK-NEXT: stw 3, -104(1) +; CHECK-NEXT: lwz 3, -84(1) +; CHECK-NEXT: stw 3, -96(1) ; CHECK-NEXT: .LBB0_14: # %entry -; CHECK-NEXT: lwz 3, -104(1) -; CHECK-NEXT: lwz 4, -76(1) -; CHECK-NEXT: addi 5, 4, 1 -; CHECK-NEXT: lwz 6, -4(1) -; CHECK-NEXT: stb 5, 0(6) -; CHECK-NEXT: lwz 5, -100(1) -; CHECK-NEXT: stw 5, 4(6) +; CHECK-NEXT: lwz 3, -96(1) ; CHECK-NEXT: lwz 3, 0(3) store i32 %z, i32* @var3, align 4 -; CHECK-NEXT: lis 5, var3@ha -; CHECK-NEXT: stw 3, var3@l(5) +; CHECK-NEXT: lis 4, var3@ha +; CHECK-NEXT: stw 3, var3@l(4) +; CHECK-NEXT: lwz 3, -4(1) ret void -; CHECK-NEXT: stw 6, -108(1) +; CHECK-NEXT: stw 3, -100(1) ; CHECK-NEXT: blr } diff --git a/test/CodeGen/Thumb2/machine-licm.ll b/test/CodeGen/Thumb2/machine-licm.ll index 46937fc..4e16c9a 100644 --- a/test/CodeGen/Thumb2/machine-licm.ll +++ b/test/CodeGen/Thumb2/machine-licm.ll @@ -51,12 +51,11 @@ return: ; preds = %bb, %entry define void @t2(i8* %ptr1, i8* %ptr2) nounwind { entry: ; CHECK: t2: -; CHECK: mov.w [[R3:r[0-9]+]], #1065353216 -; CHECK: vdup.32 q{{.*}}, [[R3]] +; CHECK: vmov.f32 q{{.*}}, #1.000000e+00 br i1 undef, label %bb1, label %bb2 bb1: -; CHECK-NEXT: %bb1 +; CHECK: %bb1 %indvar = phi i32 [ %indvar.next, %bb1 ], [ 0, %entry ] %tmp1 = shl i32 %indvar, 2 %gep1 = getelementptr i8* %ptr1, i32 %tmp1 diff --git a/test/CodeGen/X86/2011-11-22-AVX2-Domains.ll b/test/CodeGen/X86/2011-11-22-AVX2-Domains.ll new file mode 100644 index 0000000..8174109 --- /dev/null +++ b/test/CodeGen/X86/2011-11-22-AVX2-Domains.ll @@ -0,0 +1,99 @@ +; RUN: llc < %s -mcpu=corei7-avx -mattr=+avx | FileCheck %s +target datalayout = 
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin11" + +; This test would create a vpand %ymm instruction that is only legal in AVX2. +; CHECK-NOT: vpand %ymm + +declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone + +define void @ShadeTile() nounwind { +allocas: + br i1 undef, label %if_then, label %if_else + +if_then: ; preds = %allocas + unreachable + +if_else: ; preds = %allocas + br i1 undef, label %for_loop156.lr.ph, label %if_exit + +for_loop156.lr.ph: ; preds = %if_else + %val_6.i21244 = load i16* undef, align 2 + %0 = insertelement <8 x i16> undef, i16 %val_6.i21244, i32 6 + %val_7.i21248 = load i16* undef, align 2 + %1 = insertelement <8 x i16> %0, i16 %val_7.i21248, i32 7 + %uint2uint32.i20206 = zext <8 x i16> %1 to <8 x i32> + %bitop5.i20208 = and <8 x i32> %uint2uint32.i20206, <i32 31744, i32 31744, i32 31744, i32 31744, i32 31744, i32 31744, i32 31744, i32 31744> + %bitop8.i20209 = and <8 x i32> %uint2uint32.i20206, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023> + %bitop12.i20211 = lshr <8 x i32> %bitop5.i20208, <i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10, i32 10> + %binop13.i20212 = add <8 x i32> %bitop12.i20211, <i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112, i32 112> + %bitop15.i20213 = shl <8 x i32> %binop13.i20212, <i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23, i32 23> + %bitop17.i20214 = shl <8 x i32> %bitop8.i20209, <i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13, i32 13> + %bitop20.i20215 = or <8 x i32> undef, %bitop15.i20213 + %bitop22.i20216 = or <8 x i32> %bitop20.i20215, %bitop17.i20214 + %int_to_float_bitcast.i.i.i20217 = bitcast <8 x i32> %bitop22.i20216 to <8 x float> + %binop401 = fmul <8 x float> undef, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00> + %binop402 = fadd <8 x float> %binop401, <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00> + %binop403 = fmul <8 x float> zeroinitializer, %binop402 + %binop406 = fmul <8 x float> %int_to_float_bitcast.i.i.i20217, <float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00, float 4.000000e+00> + %binop407 = fadd <8 x float> %binop406, <float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00, float -2.000000e+00> + %binop408 = fmul <8 x float> zeroinitializer, %binop407 + %binop411 = fsub <8 x float> <float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00, float 3.000000e+00>, undef + %val_4.i21290 = load i16* undef, align 2 + %2 = insertelement <8 x i16> undef, i16 %val_4.i21290, i32 4 + %val_5.i21294 = load i16* undef, align 2 + %3 = insertelement <8 x i16> %2, i16 %val_5.i21294, i32 5 + %val_6.i21298 = load i16* undef, align 2 + %4 = insertelement <8 x i16> %3, i16 %val_6.i21298, i32 6 + %ptr_7.i21301 = inttoptr i64 undef to i16* + %val_7.i21302 = load i16* %ptr_7.i21301, align 2 + %5 = insertelement <8 x i16> %4, i16 %val_7.i21302, i32 7 + %uint2uint32.i20218 = zext <8 x i16> %5 to <8 x i32> + 
%structelement561 = load i8** undef, align 8 + %ptr2int563 = ptrtoint i8* %structelement561 to i64 + %smear.ptr_smear7571 = insertelement <8 x i64> undef, i64 %ptr2int563, i32 7 + %new_ptr582 = add <8 x i64> %smear.ptr_smear7571, zeroinitializer + %val_5.i21509 = load i8* null, align 1 + %6 = insertelement <8 x i8> undef, i8 %val_5.i21509, i32 5 + %7 = insertelement <8 x i8> %6, i8 undef, i32 6 + %iptr_7.i21515 = extractelement <8 x i64> %new_ptr582, i32 7 + %ptr_7.i21516 = inttoptr i64 %iptr_7.i21515 to i8* + %val_7.i21517 = load i8* %ptr_7.i21516, align 1 + %8 = insertelement <8 x i8> %7, i8 %val_7.i21517, i32 7 + %uint2float.i20245 = uitofp <8 x i8> %8 to <8 x float> + %binop.i20246 = fmul <8 x float> %uint2float.i20245, <float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000, float 0x3F70101020000000> + br i1 undef, label %for_loop594.lr.ph, label %for_exit595 + +if_exit: ; preds = %if_else + ret void + +for_loop594.lr.ph: ; preds = %for_loop156.lr.ph + %bitop8.i20221 = and <8 x i32> %uint2uint32.i20218, <i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023, i32 1023> + br i1 undef, label %cif_test_all730, label %cif_mask_mixed1552 + +for_exit595: ; preds = %for_loop156.lr.ph + unreachable + +cif_test_all730: ; preds = %for_loop594.lr.ph + %binop11.i20545 = fmul <8 x float> %binop408, zeroinitializer + %binop12.i20546 = fadd <8 x float> undef, %binop11.i20545 + %binop15.i20547 = fmul <8 x float> %binop411, undef + %binop16.i20548 = fadd <8 x float> %binop12.i20546, %binop15.i20547 + %bincmp774 = fcmp ogt <8 x float> %binop16.i20548, zeroinitializer + %val_to_boolvec32775 = sext <8 x i1> %bincmp774 to <8 x i32> + %floatmask.i20549 = bitcast <8 x i32> %val_to_boolvec32775 to <8 x float> + %v.i20550 = tail call i32 @llvm.x86.avx.movmsk.ps.256(<8 x float> %floatmask.i20549) nounwind readnone + %cond = icmp eq i32 %v.i20550, 255 + br i1 %cond, label %cif_test_all794, label %cif_test_mixed + +cif_test_all794: ; preds = %cif_test_all730 + %binop.i20572 = fmul <8 x float> %binop403, undef + unreachable + +cif_test_mixed: ; preds = %cif_test_all730 + %binop1207 = fmul <8 x float> %binop.i20246, undef + unreachable + +cif_mask_mixed1552: ; preds = %for_loop594.lr.ph + unreachable +} diff --git a/test/CodeGen/X86/avx-intrinsics-x86.ll b/test/CodeGen/X86/avx-intrinsics-x86.ll index 3fa1d95..df12b71 100644 --- a/test/CodeGen/X86/avx-intrinsics-x86.ll +++ b/test/CodeGen/X86/avx-intrinsics-x86.ll @@ -2021,7 +2021,9 @@ declare <32 x i8> @llvm.x86.avx.ldu.dq.256(i8*) nounwind readonly define <32 x i8> @test_x86_avx_loadu_dq_256(i8* %a0) { ; CHECK: vmovdqu - %res = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1] + %a1 = call <32 x i8> @llvm.x86.avx.loadu.dq.256(i8* %a0) ; <<32 x i8>> [#uses=1] + ; add operation forces the execution domain. 
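; Editor's note (commentary, not part of the patch): "forces the execution
; domain" above refers to x86 having interchangeable integer and floating-point
; encodings for plain vector moves (e.g. vmovdqu vs. vmovups/vmovupd), between
; which a late pass may freely switch when the value has no domain-specific
; user. Routing the value through an arithmetic op pins the domain, so the
; CHECK line can rely on a single opcode. The same idiom in miniature (a
; hypothetical test, not from the patch):
;   %v = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %p)
;   %pin = fadd <4 x double> %v, <double 0x0, double 0x0, double 0x0, double 0x0>
;   ret <4 x double> %pin    ; the fadd keeps the load in the FP domain -> vmovupd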
+ %res = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> ret <32 x i8> %res } declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly @@ -2029,7 +2031,9 @@ declare <32 x i8> @llvm.x86.avx.loadu.dq.256(i8*) nounwind readonly define <4 x double> @test_x86_avx_loadu_pd_256(i8* %a0) { ; CHECK: vmovupd - %res = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1] + %a1 = call <4 x double> @llvm.x86.avx.loadu.pd.256(i8* %a0) ; <<4 x double>> [#uses=1] + ; add operation forces the execution domain. + %res = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0> ret <4 x double> %res } declare <4 x double> @llvm.x86.avx.loadu.pd.256(i8*) nounwind readonly @@ -2157,7 +2161,9 @@ declare i32 @llvm.x86.avx.movmsk.ps.256(<8 x float>) nounwind readnone define void @test_x86_avx_movnt_dq_256(i8* %a0, <4 x i64> %a1) { ; CHECK: vmovntdq - call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a1) + ; add operation forces the execution domain. + %a2 = add <4 x i64> %a1, <i64 1, i64 1, i64 1, i64 1> + call void @llvm.x86.avx.movnt.dq.256(i8* %a0, <4 x i64> %a2) ret void } declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind @@ -2165,7 +2171,8 @@ declare void @llvm.x86.avx.movnt.dq.256(i8*, <4 x i64>) nounwind define void @test_x86_avx_movnt_pd_256(i8* %a0, <4 x double> %a1) { ; CHECK: vmovntpd - call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a1) + %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0> + call void @llvm.x86.avx.movnt.pd.256(i8* %a0, <4 x double> %a2) ret void } declare void @llvm.x86.avx.movnt.pd.256(i8*, <4 x double>) nounwind @@ -2258,7 +2265,9 @@ declare <8 x float> @llvm.x86.avx.sqrt.ps.256(<8 x float>) nounwind readnone define void @test_x86_avx_storeu_dq_256(i8* %a0, <32 x i8> %a1) { ; CHECK: vmovdqu - call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a1) + ; add operation forces the execution domain. + %a2 = add <32 x i8> %a1, <i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1, i8 1> + call void @llvm.x86.avx.storeu.dq.256(i8* %a0, <32 x i8> %a2) ret void } declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind @@ -2266,7 +2275,9 @@ declare void @llvm.x86.avx.storeu.dq.256(i8*, <32 x i8>) nounwind define void @test_x86_avx_storeu_pd_256(i8* %a0, <4 x double> %a1) { ; CHECK: vmovupd - call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a1) + ; add operation forces the execution domain. 
+ %a2 = fadd <4 x double> %a1, <double 0x0, double 0x0, double 0x0, double 0x0> + call void @llvm.x86.avx.storeu.pd.256(i8* %a0, <4 x double> %a2) ret void } declare void @llvm.x86.avx.storeu.pd.256(i8*, <4 x double>) nounwind diff --git a/test/CodeGen/X86/avx-logic.ll b/test/CodeGen/X86/avx-logic.ll index cd37135..115cefb 100644 --- a/test/CodeGen/X86/avx-logic.ll +++ b/test/CodeGen/X86/avx-logic.ll @@ -7,7 +7,9 @@ entry: %1 = bitcast <4 x double> %y to <4 x i64> %and.i = and <4 x i64> %0, %1 %2 = bitcast <4 x i64> %and.i to <4 x double> - ret <4 x double> %2 + ; add forces execution domain + %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %3 } ; CHECK: vandpd LCP{{.*}}(%rip) @@ -16,7 +18,9 @@ entry: %0 = bitcast <4 x double> %y to <4 x i64> %and.i = and <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507> %1 = bitcast <4 x i64> %and.i to <4 x double> - ret <4 x double> %1 + ; add forces execution domain + %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %2 } ; CHECK: vandps @@ -45,7 +49,9 @@ entry: %1 = bitcast <4 x double> %y to <4 x i64> %xor.i = xor <4 x i64> %0, %1 %2 = bitcast <4 x i64> %xor.i to <4 x double> - ret <4 x double> %2 + ; add forces execution domain + %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %3 } ; CHECK: vxorpd LCP{{.*}}(%rip) @@ -54,7 +60,9 @@ entry: %0 = bitcast <4 x double> %y to <4 x i64> %xor.i = xor <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507> %1 = bitcast <4 x i64> %xor.i to <4 x double> - ret <4 x double> %1 + ; add forces execution domain + %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %2 } ; CHECK: vxorps @@ -83,7 +91,9 @@ entry: %1 = bitcast <4 x double> %y to <4 x i64> %or.i = or <4 x i64> %0, %1 %2 = bitcast <4 x i64> %or.i to <4 x double> - ret <4 x double> %2 + ; add forces execution domain + %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %3 } ; CHECK: vorpd LCP{{.*}}(%rip) @@ -92,7 +102,9 @@ entry: %0 = bitcast <4 x double> %y to <4 x i64> %or.i = or <4 x i64> %0, <i64 4616752568008179712, i64 4614838538166547251, i64 4612361558371493478, i64 4608083138725491507> %1 = bitcast <4 x i64> %or.i to <4 x double> - ret <4 x double> %1 + ; add forces execution domain + %2 = fadd <4 x double> %1, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %2 } ; CHECK: vorps @@ -122,7 +134,9 @@ entry: %1 = bitcast <4 x double> %y to <4 x i64> %and.i = and <4 x i64> %1, %neg.i %2 = bitcast <4 x i64> %and.i to <4 x double> - ret <4 x double> %2 + ; add forces execution domain + %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %3 } ; CHECK: vandnpd (% @@ -134,7 +148,9 @@ entry: %1 = bitcast <4 x double> %tmp2 to <4 x i64> %and.i = and <4 x i64> %1, %neg.i %2 = bitcast <4 x i64> %and.i to <4 x double> - ret <4 x double> %2 + ; add forces execution domain + %3 = fadd <4 x double> %2, <double 0x0, double 0x0, double 0x0, double 0x0> + ret <4 x double> %3 } ; CHECK: vandnps diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll index a33423d..681747b 100644 --- a/test/CodeGen/X86/avx-shift.ll +++ b/test/CodeGen/X86/avx-shift.ll @@ -112,3 +112,27 @@ define <8 x i32> @vshift08(<8 x i32> %a) nounwind { ret <8 x i32> %bitop } +;;; 
Uses shifts for sign extension +; CHECK: _sext_v16i16 +; CHECK: vpsllw +; CHECK: vpsraw +; CHECK: vpsllw +; CHECK: vpsraw +; CHECK: vinsertf128 +define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind { + %b = trunc <16 x i16> %a to <16 x i8> + %c = sext <16 x i8> %b to <16 x i16> + ret <16 x i16> %c +} + +; CHECK: _sext_v8i32 +; CHECK: vpslld +; CHECK: vpsrad +; CHECK: vpslld +; CHECK: vpsrad +; CHECK: vinsertf128 +define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind { + %b = trunc <8 x i32> %a to <8 x i16> + %c = sext <8 x i16> %b to <8 x i32> + ret <8 x i32> %c +} diff --git a/test/CodeGen/X86/avx-unpack.ll b/test/CodeGen/X86/avx-unpack.ll index d420101..fcd7bb6 100644 --- a/test/CodeGen/X86/avx-unpack.ll +++ b/test/CodeGen/X86/avx-unpack.ll @@ -67,6 +67,15 @@ entry: ret <8 x i32> %shuffle.i } +; CHECK: vunpckhps (% +define <8 x i32> @unpackhips2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp { +entry: + %a = load <8 x i32>* %src1 + %b = load <8 x i32>* %src2 + %shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x i32> %shuffle.i +} + ; CHECK: vunpckhpd define <4 x i64> @unpackhipd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { entry: @@ -74,6 +83,15 @@ entry: ret <4 x i64> %shuffle.i } +; CHECK: vunpckhpd (% +define <4 x i64> @unpackhipd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp { +entry: + %a = load <4 x i64>* %src1 + %b = load <4 x i64>* %src2 + %shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x i64> %shuffle.i +} + ; CHECK: vunpcklps define <8 x i32> @unpacklops1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp { entry: @@ -81,9 +99,27 @@ entry: ret <8 x i32> %shuffle.i } +; CHECK: vunpcklps (% +define <8 x i32> @unpacklops2(<8 x i32>* %src1, <8 x i32>* %src2) nounwind uwtable readnone ssp { +entry: + %a = load <8 x i32>* %src1 + %b = load <8 x i32>* %src2 + %shuffle.i = shufflevector <8 x i32> %a, <8 x i32> %b, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x i32> %shuffle.i +} + ; CHECK: vunpcklpd define <4 x i64> @unpacklopd1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { entry: %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> ret <4 x i64> %shuffle.i } + +; CHECK: vunpcklpd (% +define <4 x i64> @unpacklopd2(<4 x i64>* %src1, <4 x i64>* %src2) nounwind uwtable readnone ssp { +entry: + %a = load <4 x i64>* %src1 + %b = load <4 x i64>* %src2 + %shuffle.i = shufflevector <4 x i64> %a, <4 x i64> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x i64> %shuffle.i +} diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll index 89b4188..8fbd02a 100644 --- a/test/CodeGen/X86/avx-vbroadcast.ll +++ b/test/CodeGen/X86/avx-vbroadcast.ll @@ -1,7 +1,4 @@ ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s -; XFAIL: * - -; xfail this file for now because of PR8156, when it gets solved merge this with avx-splat.ll ; CHECK: vbroadcastsd (% define <4 x i64> @A(i64* %ptr) nounwind uwtable readnone ssp { diff --git a/test/CodeGen/X86/avx-vshufp.ll b/test/CodeGen/X86/avx-vshufp.ll index f06548d..7ec3a44 100644 --- a/test/CodeGen/X86/avx-vshufp.ll +++ b/test/CodeGen/X86/avx-vshufp.ll @@ -27,3 +27,17 @@ entry: %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 5, i32 2, i32 undef> ret <4 x 
double> %shuffle } + +; CHECK: vshufps $-55, %ymm +define <8 x float> @E(<8 x float> %a, <8 x float> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <8 x float> %a, <8 x float> %b, <8 x i32> <i32 9, i32 10, i32 0, i32 3, i32 13, i32 14, i32 4, i32 7> + ret <8 x float> %shuffle +} + +; CHECK: vshufpd $8, %ymm +define <4 x double> @F(<4 x double> %a, <4 x double> %b) nounwind uwtable readnone ssp { +entry: + %shuffle = shufflevector <4 x double> %a, <4 x double> %b, <4 x i32> <i32 0, i32 4, i32 2, i32 7> + ret <4 x double> %shuffle +} diff --git a/test/CodeGen/X86/avx2-logic.ll b/test/CodeGen/X86/avx2-logic.ll index f1c294c..13ebaa6 100644 --- a/test/CodeGen/X86/avx2-logic.ll +++ b/test/CodeGen/X86/avx2-logic.ll @@ -53,3 +53,44 @@ define <32 x i8> @vpblendvb(<32 x i8> %x, <32 x i8> %y) { %min = select <32 x i1> %min_is_x, <32 x i8> %x, <32 x i8> %y ret <32 x i8> %min } + +define <8 x i32> @signd(<8 x i32> %a, <8 x i32> %b) nounwind { +entry: +; CHECK: signd: +; CHECK: psignd +; CHECK-NOT: sub +; CHECK: ret + %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + %sub = sub nsw <8 x i32> zeroinitializer, %a + %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + %1 = and <8 x i32> %a, %0 + %2 = and <8 x i32> %b.lobit, %sub + %cond = or <8 x i32> %1, %2 + ret <8 x i32> %cond +} + +define <8 x i32> @blendvb(<8 x i32> %b, <8 x i32> %a, <8 x i32> %c) nounwind { +entry: +; CHECK: blendvb: +; CHECK: pblendvb +; CHECK: ret + %b.lobit = ashr <8 x i32> %b, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31> + %sub = sub nsw <8 x i32> zeroinitializer, %a + %0 = xor <8 x i32> %b.lobit, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> + %1 = and <8 x i32> %c, %0 + %2 = and <8 x i32> %a, %b.lobit + %cond = or <8 x i32> %1, %2 + ret <8 x i32> %cond +} + +define <8 x i32> @allOnes() nounwind { +; CHECK: vpcmpeqd +; CHECK-NOT: vinsert + ret <8 x i32> <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1> +} + +define <16 x i16> @allOnes2() nounwind { +; CHECK: vpcmpeqd +; CHECK-NOT: vinsert + ret <16 x i16> <i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1, i16 -1> +} diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll index f759361..b6cf54e 100644 --- a/test/CodeGen/X86/avx2-shift.ll +++ b/test/CodeGen/X86/avx2-shift.ll @@ -58,14 +58,14 @@ define <4 x i64> @variable_srl3(<4 x i64> %x, <4 x i64> %y) { } ; CHECK: variable_sra0 -; CHECK: psravd +; CHECK: vpsravd ; CHECK: ret define <4 x i32> @variable_sra0(<4 x i32> %x, <4 x i32> %y) { %k = ashr <4 x i32> %x, %y ret <4 x i32> %k } ; CHECK: variable_sra1 -; CHECK: psravd +; CHECK: vpsravd ; CHECK: ret define <8 x i32> @variable_sra1(<8 x i32> %x, <8 x i32> %y) { %k = ashr <8 x i32> %x, %y @@ -127,7 +127,7 @@ define <16 x i16> @vshift07(<16 x i16> %a) nounwind readnone { } ; CHECK: variable_sra0_load -; CHECK: psravd (% +; CHECK: vpsravd (% ; CHECK: ret define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) { %y1 = load <4 x i32>* %y @@ -136,7 +136,7 @@ define <4 x i32> @variable_sra0_load(<4 x i32> %x, <4 x i32>* %y) { } ; CHECK: variable_sra1_load -; CHECK: psravd (% +; CHECK: vpsravd (% ; CHECK: ret define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) { %y1 = load <8 x i32>* %y @@ -145,7 +145,7 @@ define <8 x i32> @variable_sra1_load(<8 x i32> %x, <8 x i32>* %y) { } ; CHECK: variable_shl0_load -; 
CHECK: psllvd (% +; CHECK: vpsllvd (% ; CHECK: ret define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) { %y1 = load <4 x i32>* %y @@ -153,7 +153,7 @@ define <4 x i32> @variable_shl0_load(<4 x i32> %x, <4 x i32>* %y) { ret <4 x i32> %k } ; CHECK: variable_shl1_load -; CHECK: psllvd (% +; CHECK: vpsllvd (% ; CHECK: ret define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) { %y1 = load <8 x i32>* %y @@ -161,7 +161,7 @@ define <8 x i32> @variable_shl1_load(<8 x i32> %x, <8 x i32>* %y) { ret <8 x i32> %k } ; CHECK: variable_shl2_load -; CHECK: psllvq (% +; CHECK: vpsllvq (% ; CHECK: ret define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) { %y1 = load <2 x i64>* %y @@ -169,7 +169,7 @@ define <2 x i64> @variable_shl2_load(<2 x i64> %x, <2 x i64>* %y) { ret <2 x i64> %k } ; CHECK: variable_shl3_load -; CHECK: psllvq (% +; CHECK: vpsllvq (% ; CHECK: ret define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) { %y1 = load <4 x i64>* %y @@ -177,7 +177,7 @@ define <4 x i64> @variable_shl3_load(<4 x i64> %x, <4 x i64>* %y) { ret <4 x i64> %k } ; CHECK: variable_srl0_load -; CHECK: psrlvd (% +; CHECK: vpsrlvd (% ; CHECK: ret define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) { %y1 = load <4 x i32>* %y @@ -185,7 +185,7 @@ define <4 x i32> @variable_srl0_load(<4 x i32> %x, <4 x i32>* %y) { ret <4 x i32> %k } ; CHECK: variable_srl1_load -; CHECK: psrlvd (% +; CHECK: vpsrlvd (% ; CHECK: ret define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) { %y1 = load <8 x i32>* %y @@ -193,7 +193,7 @@ define <8 x i32> @variable_srl1_load(<8 x i32> %x, <8 x i32>* %y) { ret <8 x i32> %k } ; CHECK: variable_srl2_load -; CHECK: psrlvq (% +; CHECK: vpsrlvq (% ; CHECK: ret define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) { %y1 = load <2 x i64>* %y @@ -201,10 +201,68 @@ define <2 x i64> @variable_srl2_load(<2 x i64> %x, <2 x i64>* %y) { ret <2 x i64> %k } ; CHECK: variable_srl3_load -; CHECK: psrlvq (% +; CHECK: vpsrlvq (% ; CHECK: ret define <4 x i64> @variable_srl3_load(<4 x i64> %x, <4 x i64>* %y) { %y1 = load <4 x i64>* %y %k = lshr <4 x i64> %x, %y1 ret <4 x i64> %k } + +define <32 x i8> @shl9(<32 x i8> %A) nounwind { + %B = shl <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <32 x i8> %B +; CHECK: shl9: +; CHECK: vpsllw $3 +; CHECK: vpand +; CHECK: ret +} + +define <32 x i8> @shr9(<32 x i8> %A) nounwind { + %B = lshr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <32 x i8> %B +; CHECK: shr9: +; CHECK: vpsrlw $3 +; CHECK: vpand +; CHECK: ret +} + +define <32 x i8> @sra_v32i8_7(<32 x i8> %A) nounwind { + %B = ashr <32 x i8> %A, <i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7> + ret <32 x i8> %B +; CHECK: sra_v32i8_7: +; CHECK: vxorps +; CHECK: vpcmpgtb +; CHECK: ret +} + +define <32 x i8> @sra_v32i8(<32 x i8> %A) nounwind { + %B = ashr <32 x i8> %A, <i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3, i8 3> + ret <32 x i8> %B +; 
CHECK: sra_v32i8: +; CHECK: vpsrlw $3 +; CHECK: vpand +; CHECK: vpxor +; CHECK: vpsubb +; CHECK: ret +} + +; CHECK: _sext_v16i16 +; CHECK: vpsllw +; CHECK: vpsraw +; CHECK-NOT: vinsertf128 +define <16 x i16> @sext_v16i16(<16 x i16> %a) nounwind { + %b = trunc <16 x i16> %a to <16 x i8> + %c = sext <16 x i8> %b to <16 x i16> + ret <16 x i16> %c +} + +; CHECK: _sext_v8i32 +; CHECK: vpslld +; CHECK: vpsrad +; CHECK-NOT: vinsertf128 +define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind { + %b = trunc <8 x i32> %a to <8 x i16> + %c = sext <8 x i16> %b to <8 x i32> + ret <8 x i32> %c +} diff --git a/test/CodeGen/X86/avx2-unpack.ll b/test/CodeGen/X86/avx2-unpack.ll new file mode 100644 index 0000000..aa97308 --- /dev/null +++ b/test/CodeGen/X86/avx2-unpack.ll @@ -0,0 +1,57 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s + +; CHECK: vpunpckhdq +define <8 x i32> @unpackhidq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 2, i32 10, i32 3, i32 11, i32 6, i32 14, i32 7, i32 15> + ret <8 x i32> %shuffle.i +} + +; CHECK: vpunpckhqdq +define <4 x i64> @unpackhiqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 1, i32 5, i32 3, i32 7> + ret <4 x i64> %shuffle.i +} + +; CHECK: vpunpckldq +define <8 x i32> @unpacklodq1(<8 x i32> %src1, <8 x i32> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <8 x i32> %src1, <8 x i32> %src2, <8 x i32> <i32 0, i32 8, i32 1, i32 9, i32 4, i32 12, i32 5, i32 13> + ret <8 x i32> %shuffle.i +} + +; CHECK: vpunpcklqdq +define <4 x i64> @unpacklqdq1(<4 x i64> %src1, <4 x i64> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <4 x i64> %src1, <4 x i64> %src2, <4 x i32> <i32 0, i32 4, i32 2, i32 6> + ret <4 x i64> %shuffle.i +} + +; CHECK: vpunpckhwd +define <16 x i16> @unpackhwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 4, i32 20, i32 5, i32 21, i32 6, i32 22, i32 7, i32 23, i32 12, i32 28, i32 13, i32 29, i32 14, i32 30, i32 15, i32 31> + ret <16 x i16> %shuffle.i +} + +; CHECK: vpunpcklwd +define <16 x i16> @unpacklwd(<16 x i16> %src1, <16 x i16> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src2, <16 x i32> <i32 0, i32 16, i32 1, i32 17, i32 2, i32 18, i32 3, i32 19, i32 8, i32 24, i32 9, i32 25, i32 10, i32 26, i32 11, i32 27> + ret <16 x i16> %shuffle.i +} + +; CHECK: vpunpckhbw +define <32 x i8> @unpackhbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 8, i32 40, i32 9, i32 41, i32 10, i32 42, i32 11, i32 43, i32 12, i32 44, i32 13, i32 45, i32 14, i32 46, i32 15, i32 47, i32 24, i32 56, i32 25, i32 57, i32 26, i32 58, i32 27, i32 59, i32 28, i32 60, i32 29, i32 61, i32 30, i32 62, i32 31, i32 63> + ret <32 x i8> %shuffle.i +} + +; CHECK: vpunpcklbw +define <32 x i8> @unpacklbw(<32 x i8> %src1, <32 x i8> %src2) nounwind uwtable readnone ssp { +entry: + %shuffle.i = shufflevector <32 x i8> %src1, <32 x i8> %src2, <32 x i32> <i32 0, i32 32, i32 1, i32 33, i32 2, i32 34, i32 3, i32 35, i32 4, i32 36, i32 5, i32 37, i32 6, i32 38, i32 7, i32 39, i32 16, i32 48, i32 17, i32 49, i32 18, i32 50, i32 
19, i32 51, i32 20, i32 52, i32 21, i32 53, i32 22, i32 54, i32 23, i32 55> + ret <32 x i8> %shuffle.i +} diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll new file mode 100644 index 0000000..142be33 --- /dev/null +++ b/test/CodeGen/X86/avx2-vbroadcast.ll @@ -0,0 +1,142 @@ +; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=+avx2 | FileCheck %s + +; CHECK: vpbroadcastb (% +define <16 x i8> @BB16(i8* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i8* %ptr, align 4 + %q0 = insertelement <16 x i8> undef, i8 %q, i32 0 + %q1 = insertelement <16 x i8> %q0, i8 %q, i32 1 + %q2 = insertelement <16 x i8> %q1, i8 %q, i32 2 + %q3 = insertelement <16 x i8> %q2, i8 %q, i32 3 + %q4 = insertelement <16 x i8> %q3, i8 %q, i32 4 + %q5 = insertelement <16 x i8> %q4, i8 %q, i32 5 + %q6 = insertelement <16 x i8> %q5, i8 %q, i32 6 + %q7 = insertelement <16 x i8> %q6, i8 %q, i32 7 + %q8 = insertelement <16 x i8> %q7, i8 %q, i32 8 + %q9 = insertelement <16 x i8> %q8, i8 %q, i32 9 + %qa = insertelement <16 x i8> %q9, i8 %q, i32 10 + %qb = insertelement <16 x i8> %qa, i8 %q, i32 11 + %qc = insertelement <16 x i8> %qb, i8 %q, i32 12 + %qd = insertelement <16 x i8> %qc, i8 %q, i32 13 + %qe = insertelement <16 x i8> %qd, i8 %q, i32 14 + %qf = insertelement <16 x i8> %qe, i8 %q, i32 15 + ret <16 x i8> %qf +} +; CHECK: vpbroadcastb (% +define <32 x i8> @BB32(i8* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i8* %ptr, align 4 + %q0 = insertelement <32 x i8> undef, i8 %q, i32 0 + %q1 = insertelement <32 x i8> %q0, i8 %q, i32 1 + %q2 = insertelement <32 x i8> %q1, i8 %q, i32 2 + %q3 = insertelement <32 x i8> %q2, i8 %q, i32 3 + %q4 = insertelement <32 x i8> %q3, i8 %q, i32 4 + %q5 = insertelement <32 x i8> %q4, i8 %q, i32 5 + %q6 = insertelement <32 x i8> %q5, i8 %q, i32 6 + %q7 = insertelement <32 x i8> %q6, i8 %q, i32 7 + %q8 = insertelement <32 x i8> %q7, i8 %q, i32 8 + %q9 = insertelement <32 x i8> %q8, i8 %q, i32 9 + %qa = insertelement <32 x i8> %q9, i8 %q, i32 10 + %qb = insertelement <32 x i8> %qa, i8 %q, i32 11 + %qc = insertelement <32 x i8> %qb, i8 %q, i32 12 + %qd = insertelement <32 x i8> %qc, i8 %q, i32 13 + %qe = insertelement <32 x i8> %qd, i8 %q, i32 14 + %qf = insertelement <32 x i8> %qe, i8 %q, i32 15 + + %q20 = insertelement <32 x i8> %qf, i8 %q, i32 16 + %q21 = insertelement <32 x i8> %q20, i8 %q, i32 17 + %q22 = insertelement <32 x i8> %q21, i8 %q, i32 18 + %q23 = insertelement <32 x i8> %q22, i8 %q, i32 19 + %q24 = insertelement <32 x i8> %q23, i8 %q, i32 20 + %q25 = insertelement <32 x i8> %q24, i8 %q, i32 21 + %q26 = insertelement <32 x i8> %q25, i8 %q, i32 22 + %q27 = insertelement <32 x i8> %q26, i8 %q, i32 23 + %q28 = insertelement <32 x i8> %q27, i8 %q, i32 24 + %q29 = insertelement <32 x i8> %q28, i8 %q, i32 25 + %q2a = insertelement <32 x i8> %q29, i8 %q, i32 26 + %q2b = insertelement <32 x i8> %q2a, i8 %q, i32 27 + %q2c = insertelement <32 x i8> %q2b, i8 %q, i32 28 + %q2d = insertelement <32 x i8> %q2c, i8 %q, i32 29 + %q2e = insertelement <32 x i8> %q2d, i8 %q, i32 30 + %q2f = insertelement <32 x i8> %q2e, i8 %q, i32 31 + ret <32 x i8> %q2f +} +; CHECK: vpbroadcastw (% + +define <8 x i16> @W16(i16* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i16* %ptr, align 4 + %q0 = insertelement <8 x i16> undef, i16 %q, i32 0 + %q1 = insertelement <8 x i16> %q0, i16 %q, i32 1 + %q2 = insertelement <8 x i16> %q1, i16 %q, i32 2 + %q3 = insertelement <8 x i16> %q2, i16 %q, i32 3 + %q4 = insertelement <8 x i16> %q3, i16 
%q, i32 4 + %q5 = insertelement <8 x i16> %q4, i16 %q, i32 5 + %q6 = insertelement <8 x i16> %q5, i16 %q, i32 6 + %q7 = insertelement <8 x i16> %q6, i16 %q, i32 7 + ret <8 x i16> %q7 +} +; CHECK: vpbroadcastw (% +define <16 x i16> @WW16(i16* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i16* %ptr, align 4 + %q0 = insertelement <16 x i16> undef, i16 %q, i32 0 + %q1 = insertelement <16 x i16> %q0, i16 %q, i32 1 + %q2 = insertelement <16 x i16> %q1, i16 %q, i32 2 + %q3 = insertelement <16 x i16> %q2, i16 %q, i32 3 + %q4 = insertelement <16 x i16> %q3, i16 %q, i32 4 + %q5 = insertelement <16 x i16> %q4, i16 %q, i32 5 + %q6 = insertelement <16 x i16> %q5, i16 %q, i32 6 + %q7 = insertelement <16 x i16> %q6, i16 %q, i32 7 + %q8 = insertelement <16 x i16> %q7, i16 %q, i32 8 + %q9 = insertelement <16 x i16> %q8, i16 %q, i32 9 + %qa = insertelement <16 x i16> %q9, i16 %q, i32 10 + %qb = insertelement <16 x i16> %qa, i16 %q, i32 11 + %qc = insertelement <16 x i16> %qb, i16 %q, i32 12 + %qd = insertelement <16 x i16> %qc, i16 %q, i32 13 + %qe = insertelement <16 x i16> %qd, i16 %q, i32 14 + %qf = insertelement <16 x i16> %qe, i16 %q, i32 15 + ret <16 x i16> %qf +} +; CHECK: vpbroadcastd (% +define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i32* %ptr, align 4 + %q0 = insertelement <4 x i32> undef, i32 %q, i32 0 + %q1 = insertelement <4 x i32> %q0, i32 %q, i32 1 + %q2 = insertelement <4 x i32> %q1, i32 %q, i32 2 + %q3 = insertelement <4 x i32> %q2, i32 %q, i32 3 + ret <4 x i32> %q3 +} +; CHECK: vpbroadcastd (% +define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i32* %ptr, align 4 + %q0 = insertelement <8 x i32> undef, i32 %q, i32 0 + %q1 = insertelement <8 x i32> %q0, i32 %q, i32 1 + %q2 = insertelement <8 x i32> %q1, i32 %q, i32 2 + %q3 = insertelement <8 x i32> %q2, i32 %q, i32 3 + %q4 = insertelement <8 x i32> %q3, i32 %q, i32 4 + %q5 = insertelement <8 x i32> %q4, i32 %q, i32 5 + %q6 = insertelement <8 x i32> %q5, i32 %q, i32 6 + %q7 = insertelement <8 x i32> %q6, i32 %q, i32 7 + ret <8 x i32> %q7 +} +; CHECK: vpbroadcastq (% +define <2 x i64> @Q64(i64* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i64* %ptr, align 4 + %q0 = insertelement <2 x i64> undef, i64 %q, i32 0 + %q1 = insertelement <2 x i64> %q0, i64 %q, i32 1 + ret <2 x i64> %q1 +} +; CHECK: vpbroadcastq (% +define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp { +entry: + %q = load i64* %ptr, align 4 + %q0 = insertelement <4 x i64> undef, i64 %q, i32 0 + %q1 = insertelement <4 x i64> %q0, i64 %q, i32 1 + %q2 = insertelement <4 x i64> %q1, i64 %q, i32 2 + %q3 = insertelement <4 x i64> %q2, i64 %q, i32 3 + ret <4 x i64> %q3 +} diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll index e41d52c..f87d1a6 100644 --- a/test/CodeGen/X86/block-placement.ll +++ b/test/CodeGen/X86/block-placement.ll @@ -1,4 +1,4 @@ -; RUN: llc -march=x86 -enable-block-placement < %s | FileCheck %s +; RUN: llc -mtriple=i686-linux -enable-block-placement < %s | FileCheck %s declare void @error(i32 %i, i32 %a, i32 %b) @@ -241,8 +241,8 @@ define void @unnatural_cfg1() { ; CHECK: unnatural_cfg1 ; CHECK: %entry ; CHECK: %loop.body1 -; CHECK: %loop.body3 ; CHECK: %loop.body2 +; CHECK: %loop.body3 entry: br label %loop.header @@ -272,6 +272,77 @@ loop.body5: br label %loop.body3 } +define void @unnatural_cfg2() { +; Test that we can handle a loop with a nested natural loop *and* an unnatural +; loop. 
This was reduced from a crash on block placement when run over +; single-source GCC. +; CHECK: unnatural_cfg2 +; CHECK: %entry +; CHECK: %loop.header +; CHECK: %loop.body1 +; CHECK: %loop.body2 +; CHECK: %loop.body3 +; CHECK: %loop.inner1.begin +; The end block is folded with %loop.body3... +; CHECK-NOT: %loop.inner1.end +; CHECK: %loop.body4 +; CHECK: %loop.inner2.begin +; The loop.inner2.end block is folded +; CHECK: %bail + +entry: + br label %loop.header + +loop.header: + %comp0 = icmp eq i32* undef, null + br i1 %comp0, label %bail, label %loop.body1 + +loop.body1: + %val0 = load i32** undef, align 4 + br i1 undef, label %loop.body2, label %loop.inner1.begin + +loop.body2: + br i1 undef, label %loop.body4, label %loop.body3 + +loop.body3: + %ptr1 = getelementptr inbounds i32* %val0, i32 0 + %castptr1 = bitcast i32* %ptr1 to i32** + %val1 = load i32** %castptr1, align 4 + br label %loop.inner1.begin + +loop.inner1.begin: + %valphi = phi i32* [ %val2, %loop.inner1.end ], [ %val1, %loop.body3 ], [ %val0, %loop.body1 ] + %castval = bitcast i32* %valphi to i32* + %comp1 = icmp eq i32 undef, 48 + br i1 %comp1, label %loop.inner1.end, label %loop.body4 + +loop.inner1.end: + %ptr2 = getelementptr inbounds i32* %valphi, i32 0 + %castptr2 = bitcast i32* %ptr2 to i32** + %val2 = load i32** %castptr2, align 4 + br label %loop.inner1.begin + +loop.body4.dead: + br label %loop.body4 + +loop.body4: + %comp2 = icmp ult i32 undef, 3 + br i1 %comp2, label %loop.inner2.begin, label %loop.end + +loop.inner2.begin: + br i1 false, label %loop.end, label %loop.inner2.end + +loop.inner2.end: + %comp3 = icmp eq i32 undef, 1769472 + br i1 %comp3, label %loop.end, label %loop.inner2.begin + +loop.end: + br label %loop.header + +bail: + unreachable +} + define i32 @problematic_switch() { ; This function's CFG caused overflow in the machine branch probability ; calculation, triggering asserts. Make sure we don't crash on it. @@ -322,3 +393,470 @@ exit: %merge = phi i32 [ 3, %step ], [ 6, %entry ] ret i32 %merge } + +define void @fpcmp_unanalyzable_branch(i1 %cond) { +; This function's CFG contains an unanalyzable branch that is likely to be +; split due to having a different high-probability predecessor. +; CHECK: fpcmp_unanalyzable_branch +; CHECK: %entry +; CHECK: %exit +; CHECK-NOT: %if.then +; CHECK-NOT: %if.end +; CHECK-NOT: jne +; CHECK-NOT: jnp +; CHECK: jne +; CHECK-NEXT: jnp +; CHECK-NEXT: %if.then + +entry: +; Note that this branch must be strongly biased toward +; 'entry.if.then_crit_edge' to ensure that we would try to form a chain for +; 'entry' -> 'entry.if.then_crit_edge' -> 'if.then'. It is the last edge in that +; chain which would violate the unanalyzable branch in 'exit', but we won't even +; try this trick unless 'if.then' is believed to almost always be reached from +; 'entry.if.then_crit_edge'. 
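; Editor's note (commentary, not part of the patch): the bias described above
; is established with the !prof annotation on the branch that follows. Its
; "branch_weights" metadata (!1, defined further down in this file as 1000
; vs. 1) is what branch probability analysis reads as the relative likelihood
; of the two successors, and block placement in turn uses those probabilities
; when forming chains. The idiom in isolation (hypothetical label names):
;   br i1 %c, label %hot, label %cold, !prof !9
;   !9 = metadata !{metadata !"branch_weights", i32 1000, i32 1}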
+ br i1 %cond, label %entry.if.then_crit_edge, label %lor.lhs.false, !prof !1 + +entry.if.then_crit_edge: + %.pre14 = load i8* undef, align 1, !tbaa !0 + br label %if.then + +lor.lhs.false: + br i1 undef, label %if.end, label %exit + +exit: + %cmp.i = fcmp une double 0.000000e+00, undef + br i1 %cmp.i, label %if.then, label %if.end + +if.then: + %0 = phi i8 [ %.pre14, %entry.if.then_crit_edge ], [ undef, %exit ] + %1 = and i8 %0, 1 + store i8 %1, i8* undef, align 4, !tbaa !0 + br label %if.end + +if.end: + ret void +} + +!1 = metadata !{metadata !"branch_weights", i32 1000, i32 1} + +declare i32 @f() +declare i32 @g() +declare i32 @h(i32 %x) + +define i32 @test_global_cfg_break_profitability() { +; Check that our metrics for the profitability of a CFG break are global rather +; than local. A successor may be very hot, but if the current block isn't, it +; doesn't matter. Within this test the 'then' block is slightly warmer than the +; 'else' block, but not nearly enough to merit merging it with the exit block +; even though the probability of 'then' branching to the 'exit' block is very +; high. +; CHECK: test_global_cfg_break_profitability +; CHECK: calll {{_?}}f +; CHECK: calll {{_?}}g +; CHECK: calll {{_?}}h +; CHECK: ret + +entry: + br i1 undef, label %then, label %else, !prof !2 + +then: + %then.result = call i32 @f() + br label %exit + +else: + %else.result = call i32 @g() + br label %exit + +exit: + %result = phi i32 [ %then.result, %then ], [ %else.result, %else ] + %result2 = call i32 @h(i32 %result) + ret i32 %result +} + +!2 = metadata !{metadata !"branch_weights", i32 3, i32 1} + +declare i32 @__gxx_personality_v0(...) + +define void @test_eh_lpad_successor() { +; Sometimes the landing pad ends up as the first successor of an invoke block. +; When this happens, a strange result used to fall out of updateTerminators: we +; didn't correctly locate the fallthrough successor, assuming blindly that the +; first one was the fallthrough successor. As a result, we would add an +; erroneous jump to the landing pad thinking *that* was the default successor. +; CHECK: test_eh_lpad_successor +; CHECK: %entry +; CHECK-NOT: jmp +; CHECK: %loop + +entry: + invoke i32 @f() to label %preheader unwind label %lpad + +preheader: + br label %loop + +lpad: + %lpad.val = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + resume { i8*, i32 } %lpad.val + +loop: + br label %loop +} + +declare void @fake_throw() noreturn + +define void @test_eh_throw() { +; For blocks containing a 'throw' (or similar functionality), we have +; a no-return invoke. In this case, only EH successors will exist, and +; fallthrough simply won't occur. Make sure we don't crash trying to update +; terminators for such constructs. +; +; CHECK: test_eh_throw +; CHECK: %entry +; CHECK: %cleanup + +entry: + invoke void @fake_throw() to label %continue unwind label %cleanup + +continue: + unreachable + +cleanup: + %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) + cleanup + unreachable +} + +define void @test_unnatural_cfg_backwards_inner_loop() { +; Test that when we encounter an unnatural CFG structure after having formed +; a chain for an inner loop which happened to be laid out backwards we don't +; attempt to merge onto the wrong end of the inner loop just because we find it +; first. This was reduced from a crasher in GCC's single source. 
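; Editor's note (commentary, not part of the patch): as in the functions above,
; the expected layout is encoded purely as the order of the CHECK lines that
; follow; FileCheck matches them sequentially against llc's assembly output, so
; a placement regression surfaces as block labels appearing out of order. The
; harness, reduced to a hypothetical minimum:
;   ; RUN: llc -mtriple=i686-linux -enable-block-placement < %s | FileCheck %s
;   ; CHECK: some_function
;   ; CHECK: %entry
;   ; CHECK: %likely.block
;   ; CHECK: %unlikely.block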
+; +; CHECK: test_unnatural_cfg_backwards_inner_loop +; CHECK: %entry +; CHECK: %body +; CHECK: %loop1 +; CHECK: %loop2b +; CHECK: %loop2a + +entry: + br i1 undef, label %loop2a, label %body + +body: + br label %loop2a + +loop1: + %next.load = load i32** undef + br i1 %comp.a, label %loop2a, label %loop2b + +loop2a: + %var = phi i32* [ null, %entry ], [ null, %body ], [ %next.phi, %loop1 ] + %next.var = phi i32* [ null, %entry ], [ undef, %body ], [ %next.load, %loop1 ] + %comp.a = icmp eq i32* %var, null + br label %loop3 + +loop2b: + %gep = getelementptr inbounds i32* %var.phi, i32 0 + %next.ptr = bitcast i32* %gep to i32** + store i32* %next.phi, i32** %next.ptr + br label %loop3 + +loop3: + %var.phi = phi i32* [ %next.phi, %loop2b ], [ %var, %loop2a ] + %next.phi = phi i32* [ %next.load, %loop2b ], [ %next.var, %loop2a ] + br label %loop1 +} + +define void @unanalyzable_branch_to_loop_header() { +; Ensure that we can handle unanalyzable branches into loop headers. We +; pre-form chains for unanalyzable branches, and will find the tail end of that +; at the start of the loop. This function uses floating point comparison +; fallthrough because that happens to always produce unanalyzable branches on +; x86. +; +; CHECK: unanalyzable_branch_to_loop_header +; CHECK: %entry +; CHECK: %loop +; CHECK: %exit + +entry: + %cmp = fcmp une double 0.000000e+00, undef + br i1 %cmp, label %loop, label %exit + +loop: + %cond = icmp eq i8 undef, 42 + br i1 %cond, label %exit, label %loop + +exit: + ret void +} + +define void @unanalyzable_branch_to_best_succ(i1 %cond) { +; Ensure that we can handle unanalyzable branches where the destination block +; gets selected as the optimal successor to merge. +; +; CHECK: unanalyzable_branch_to_best_succ +; CHECK: %entry +; CHECK: %foo +; CHECK: %bar +; CHECK: %exit + +entry: + ; Bias this branch toward bar to ensure we form that chain. + br i1 %cond, label %bar, label %foo, !prof !1 + +foo: + %cmp = fcmp une double 0.000000e+00, undef + br i1 %cmp, label %bar, label %exit + +bar: + call i32 @f() + br label %exit + +exit: + ret void +} + +define void @unanalyzable_branch_to_free_block(float %x) { +; Ensure that we can handle unanalyzable branches where the destination block +; gets selected as the best free block in the CFG. +; +; CHECK: unanalyzable_branch_to_free_block +; CHECK: %entry +; CHECK: %a +; CHECK: %b +; CHECK: %c +; CHECK: %exit + +entry: + br i1 undef, label %a, label %b + +a: + call i32 @f() + br label %c + +b: + %cmp = fcmp une float %x, undef + br i1 %cmp, label %c, label %exit + +c: + call i32 @g() + br label %exit + +exit: + ret void +} + +define void @many_unanalyzable_branches() { +; Ensure that we don't crash as we're building up many unanalyzable branches, +; blocks, and loops. 
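; Editor's note (commentary, not part of the patch): "unanalyzable" in these
; tests means the target's AnalyzeBranch hook cannot decompose the terminator
; into one condition plus true/false destinations. An 'fcmp une' is a reliable
; way to get one on x86 because the unordered-or-unequal predicate lowers to a
; pair of flag tests (the fpcmp test above checks for jne followed by jnp), and
; the function below manufactures its branches with that three-line pattern:
;   %val = volatile load float* undef
;   %cmp = fcmp une float %val, undef
;   br i1 %cmp, label %next, label %here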
+; +; CHECK: many_unanalyzable_branches +; CHECK: %entry +; CHECK: %exit + +entry: + br label %0 + + %val0 = volatile load float* undef + %cmp0 = fcmp une float %val0, undef + br i1 %cmp0, label %1, label %0 + %val1 = volatile load float* undef + %cmp1 = fcmp une float %val1, undef + br i1 %cmp1, label %2, label %1 + %val2 = volatile load float* undef + %cmp2 = fcmp une float %val2, undef + br i1 %cmp2, label %3, label %2 + %val3 = volatile load float* undef + %cmp3 = fcmp une float %val3, undef + br i1 %cmp3, label %4, label %3 + %val4 = volatile load float* undef + %cmp4 = fcmp une float %val4, undef + br i1 %cmp4, label %5, label %4 + %val5 = volatile load float* undef + %cmp5 = fcmp une float %val5, undef + br i1 %cmp5, label %6, label %5 + %val6 = volatile load float* undef + %cmp6 = fcmp une float %val6, undef + br i1 %cmp6, label %7, label %6 + %val7 = volatile load float* undef + %cmp7 = fcmp une float %val7, undef + br i1 %cmp7, label %8, label %7 + %val8 = volatile load float* undef + %cmp8 = fcmp une float %val8, undef + br i1 %cmp8, label %9, label %8 + %val9 = volatile load float* undef + %cmp9 = fcmp une float %val9, undef + br i1 %cmp9, label %10, label %9 + %val10 = volatile load float* undef + %cmp10 = fcmp une float %val10, undef + br i1 %cmp10, label %11, label %10 + %val11 = volatile load float* undef + %cmp11 = fcmp une float %val11, undef + br i1 %cmp11, label %12, label %11 + %val12 = volatile load float* undef + %cmp12 = fcmp une float %val12, undef + br i1 %cmp12, label %13, label %12 + %val13 = volatile load float* undef + %cmp13 = fcmp une float %val13, undef + br i1 %cmp13, label %14, label %13 + %val14 = volatile load float* undef + %cmp14 = fcmp une float %val14, undef + br i1 %cmp14, label %15, label %14 + %val15 = volatile load float* undef + %cmp15 = fcmp une float %val15, undef + br i1 %cmp15, label %16, label %15 + %val16 = volatile load float* undef + %cmp16 = fcmp une float %val16, undef + br i1 %cmp16, label %17, label %16 + %val17 = volatile load float* undef + %cmp17 = fcmp une float %val17, undef + br i1 %cmp17, label %18, label %17 + %val18 = volatile load float* undef + %cmp18 = fcmp une float %val18, undef + br i1 %cmp18, label %19, label %18 + %val19 = volatile load float* undef + %cmp19 = fcmp une float %val19, undef + br i1 %cmp19, label %20, label %19 + %val20 = volatile load float* undef + %cmp20 = fcmp une float %val20, undef + br i1 %cmp20, label %21, label %20 + %val21 = volatile load float* undef + %cmp21 = fcmp une float %val21, undef + br i1 %cmp21, label %22, label %21 + %val22 = volatile load float* undef + %cmp22 = fcmp une float %val22, undef + br i1 %cmp22, label %23, label %22 + %val23 = volatile load float* undef + %cmp23 = fcmp une float %val23, undef + br i1 %cmp23, label %24, label %23 + %val24 = volatile load float* undef + %cmp24 = fcmp une float %val24, undef + br i1 %cmp24, label %25, label %24 + %val25 = volatile load float* undef + %cmp25 = fcmp une float %val25, undef + br i1 %cmp25, label %26, label %25 + %val26 = volatile load float* undef + %cmp26 = fcmp une float %val26, undef + br i1 %cmp26, label %27, label %26 + %val27 = volatile load float* undef + %cmp27 = fcmp une float %val27, undef + br i1 %cmp27, label %28, label %27 + %val28 = volatile load float* undef + %cmp28 = fcmp une float %val28, undef + br i1 %cmp28, label %29, label %28 + %val29 = volatile load float* undef + %cmp29 = fcmp une float %val29, undef + br i1 %cmp29, label %30, label %29 + %val30 = volatile load float* undef + %cmp30 = fcmp une float 
%val30, undef + br i1 %cmp30, label %31, label %30 + %val31 = volatile load float* undef + %cmp31 = fcmp une float %val31, undef + br i1 %cmp31, label %32, label %31 + %val32 = volatile load float* undef + %cmp32 = fcmp une float %val32, undef + br i1 %cmp32, label %33, label %32 + %val33 = volatile load float* undef + %cmp33 = fcmp une float %val33, undef + br i1 %cmp33, label %34, label %33 + %val34 = volatile load float* undef + %cmp34 = fcmp une float %val34, undef + br i1 %cmp34, label %35, label %34 + %val35 = volatile load float* undef + %cmp35 = fcmp une float %val35, undef + br i1 %cmp35, label %36, label %35 + %val36 = volatile load float* undef + %cmp36 = fcmp une float %val36, undef + br i1 %cmp36, label %37, label %36 + %val37 = volatile load float* undef + %cmp37 = fcmp une float %val37, undef + br i1 %cmp37, label %38, label %37 + %val38 = volatile load float* undef + %cmp38 = fcmp une float %val38, undef + br i1 %cmp38, label %39, label %38 + %val39 = volatile load float* undef + %cmp39 = fcmp une float %val39, undef + br i1 %cmp39, label %40, label %39 + %val40 = volatile load float* undef + %cmp40 = fcmp une float %val40, undef + br i1 %cmp40, label %41, label %40 + %val41 = volatile load float* undef + %cmp41 = fcmp une float %val41, undef + br i1 %cmp41, label %42, label %41 + %val42 = volatile load float* undef + %cmp42 = fcmp une float %val42, undef + br i1 %cmp42, label %43, label %42 + %val43 = volatile load float* undef + %cmp43 = fcmp une float %val43, undef + br i1 %cmp43, label %44, label %43 + %val44 = volatile load float* undef + %cmp44 = fcmp une float %val44, undef + br i1 %cmp44, label %45, label %44 + %val45 = volatile load float* undef + %cmp45 = fcmp une float %val45, undef + br i1 %cmp45, label %46, label %45 + %val46 = volatile load float* undef + %cmp46 = fcmp une float %val46, undef + br i1 %cmp46, label %47, label %46 + %val47 = volatile load float* undef + %cmp47 = fcmp une float %val47, undef + br i1 %cmp47, label %48, label %47 + %val48 = volatile load float* undef + %cmp48 = fcmp une float %val48, undef + br i1 %cmp48, label %49, label %48 + %val49 = volatile load float* undef + %cmp49 = fcmp une float %val49, undef + br i1 %cmp49, label %50, label %49 + %val50 = volatile load float* undef + %cmp50 = fcmp une float %val50, undef + br i1 %cmp50, label %51, label %50 + %val51 = volatile load float* undef + %cmp51 = fcmp une float %val51, undef + br i1 %cmp51, label %52, label %51 + %val52 = volatile load float* undef + %cmp52 = fcmp une float %val52, undef + br i1 %cmp52, label %53, label %52 + %val53 = volatile load float* undef + %cmp53 = fcmp une float %val53, undef + br i1 %cmp53, label %54, label %53 + %val54 = volatile load float* undef + %cmp54 = fcmp une float %val54, undef + br i1 %cmp54, label %55, label %54 + %val55 = volatile load float* undef + %cmp55 = fcmp une float %val55, undef + br i1 %cmp55, label %56, label %55 + %val56 = volatile load float* undef + %cmp56 = fcmp une float %val56, undef + br i1 %cmp56, label %57, label %56 + %val57 = volatile load float* undef + %cmp57 = fcmp une float %val57, undef + br i1 %cmp57, label %58, label %57 + %val58 = volatile load float* undef + %cmp58 = fcmp une float %val58, undef + br i1 %cmp58, label %59, label %58 + %val59 = volatile load float* undef + %cmp59 = fcmp une float %val59, undef + br i1 %cmp59, label %60, label %59 + %val60 = volatile load float* undef + %cmp60 = fcmp une float %val60, undef + br i1 %cmp60, label %61, label %60 + %val61 = volatile load float* undef + %cmp61 = fcmp 
une float %val61, undef + br i1 %cmp61, label %62, label %61 + %val62 = volatile load float* undef + %cmp62 = fcmp une float %val62, undef + br i1 %cmp62, label %63, label %62 + %val63 = volatile load float* undef + %cmp63 = fcmp une float %val63, undef + br i1 %cmp63, label %64, label %63 + %val64 = volatile load float* undef + %cmp64 = fcmp une float %val64, undef + br i1 %cmp64, label %65, label %64 + + br label %exit +exit: + ret void +} diff --git a/test/CodeGen/X86/btq.ll b/test/CodeGen/X86/btq.ll new file mode 100644 index 0000000..9c137a7 --- /dev/null +++ b/test/CodeGen/X86/btq.ll @@ -0,0 +1,35 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s + +declare void @bar() + +define void @test1(i64 %foo) nounwind { + %and = and i64 %foo, 4294967296 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +; CHECK: test1: +; CHECK: btq $32 + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} + +define void @test2(i64 %foo) nounwind { + %and = and i64 %foo, 2147483648 + %tobool = icmp eq i64 %and, 0 + br i1 %tobool, label %if.end, label %if.then + +; CHECK: test2: +; CHECK: testl $-2147483648 + +if.then: + tail call void @bar() nounwind + br label %if.end + +if.end: + ret void +} diff --git a/test/CodeGen/X86/dbg-subrange.ll b/test/CodeGen/X86/dbg-subrange.ll new file mode 100644 index 0000000..788910c --- /dev/null +++ b/test/CodeGen/X86/dbg-subrange.ll @@ -0,0 +1,37 @@ +; RUN: llc -O0 < %s | FileCheck %s +; Radar 10464995 +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-macosx10.7.2" + +@s = common global [4294967296 x i8] zeroinitializer, align 16 +;CHECK: .long 4294967295 + +define void @bar() nounwind uwtable ssp { +entry: + store i8 97, i8* getelementptr inbounds ([4294967296 x i8]* @s, i32 0, i64 0), align 1, !dbg !18 + ret void, !dbg !20 +} + +!llvm.dbg.cu = !{!0} + +!0 = metadata !{i32 720913, i32 0, i32 12, metadata !"small.c", metadata !"/private/tmp", metadata !"clang version 3.1 (trunk 144833)", i1 true, i1 false, metadata !"", i32 0, metadata !1, metadata !1, metadata !3, metadata !11} ; [ DW_TAG_compile_unit ] +!1 = metadata !{metadata !2} +!2 = metadata !{i32 0} +!3 = metadata !{metadata !4} +!4 = metadata !{metadata !5} +!5 = metadata !{i32 720942, i32 0, metadata !6, metadata !"bar", metadata !"bar", metadata !"", metadata !6, i32 4, metadata !7, i1 false, i1 true, i32 0, i32 0, i32 0, i32 256, i1 false, void ()* @bar, null, null, metadata !9} ; [ DW_TAG_subprogram ] +!6 = metadata !{i32 720937, metadata !"small.c", metadata !"/private/tmp", null} ; [ DW_TAG_file_type ] +!7 = metadata !{i32 720917, i32 0, metadata !"", i32 0, i32 0, i64 0, i64 0, i32 0, i32 0, i32 0, metadata !8, i32 0, i32 0} ; [ DW_TAG_subroutine_type ] +!8 = metadata !{null} +!9 = metadata !{metadata !10} +!10 = metadata !{i32 720932} ; [ DW_TAG_base_type ] +!11 = metadata !{metadata !12} +!12 = metadata !{metadata !13} +!13 = metadata !{i32 720948, i32 0, null, metadata !"s", metadata !"s", metadata !"", metadata !6, i32 2, metadata !14, i32 0, i32 1, [4294967296 x i8]* @s} ; [ DW_TAG_variable ] +!14 = metadata !{i32 720897, null, metadata !"", null, i32 0, i64 34359738368, i64 8, i32 0, i32 0, metadata !15, metadata !16, i32 0, i32 0} ; [ DW_TAG_array_type ] +!15 = metadata !{i32 720932, null, metadata !"char", null, i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] +!16 = 
metadata !{metadata !17} +!17 = metadata !{i32 720929, i64 0, i64 4294967295} ; [ DW_TAG_subrange_type ] +!18 = metadata !{i32 5, i32 3, metadata !19, null} +!19 = metadata !{i32 720907, metadata !5, i32 4, i32 1, metadata !6, i32 0} ; [ DW_TAG_lexical_block ] +!20 = metadata !{i32 6, i32 1, metadata !19, null} diff --git a/test/CodeGen/X86/dec-eflags-lower.ll b/test/CodeGen/X86/dec-eflags-lower.ll new file mode 100644 index 0000000..458160a --- /dev/null +++ b/test/CodeGen/X86/dec-eflags-lower.ll @@ -0,0 +1,29 @@ +; RUN: llc < %s -march=x86-64 | FileCheck %s + +%struct.obj = type { i64 } + +define void @_Z7releaseP3obj(%struct.obj* nocapture %o) nounwind uwtable ssp { +entry: +; CHECK: decq (%{{rdi|rcx}}) +; CHECK-NEXT: je + %refcnt = getelementptr inbounds %struct.obj* %o, i64 0, i32 0 + %0 = load i64* %refcnt, align 8, !tbaa !0 + %dec = add i64 %0, -1 + store i64 %dec, i64* %refcnt, align 8, !tbaa !0 + %tobool = icmp eq i64 %dec, 0 + br i1 %tobool, label %if.end, label %return + +if.end: ; preds = %entry + %1 = bitcast %struct.obj* %o to i8* + tail call void @free(i8* %1) + br label %return + +return: ; preds = %entry, %if.end + ret void +} + +declare void @free(i8* nocapture) nounwind + +!0 = metadata !{metadata !"long", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} diff --git a/test/CodeGen/X86/fast-isel-gep.ll b/test/CodeGen/X86/fast-isel-gep.ll index 91d1f5d..f0375f8 100644 --- a/test/CodeGen/X86/fast-isel-gep.ll +++ b/test/CodeGen/X86/fast-isel-gep.ll @@ -82,9 +82,8 @@ define i64 @test5(i8* %A, i32 %I, i64 %B) nounwind { ret i64 %v11 ; X64: test5: ; X64: movslq %e[[A1]], %rax -; X64-NEXT: movq (%r[[A0]],%rax), %rax -; X64-NEXT: addq %{{rdx|r8}}, %rax -; X64-NEXT: ret +; X64-NEXT: (%r[[A0]],%rax), +; X64: ret } ; PR9500, rdar://9156159 - Don't do non-local address mode folding, diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll index 6a5a102..377fd11 100644 --- a/test/CodeGen/X86/fast-isel-x86-64.ll +++ b/test/CodeGen/X86/fast-isel-x86-64.ll @@ -82,7 +82,7 @@ entry: ret i64 %mul ; CHECK: test6: -; CHECK: leaq (,%rdi,8), %rax +; CHECK: shlq $3, %rdi } define i32 @test7(i32 %x) nounwind ssp { @@ -90,7 +90,7 @@ entry: %mul = mul nsw i32 %x, 8 ret i32 %mul ; CHECK: test7: -; CHECK: leal (,%rdi,8), %eax +; CHECK: shll $3, %edi } diff --git a/test/CodeGen/X86/phaddsub.ll b/test/CodeGen/X86/phaddsub.ll new file mode 100644 index 0000000..62d85f7 --- /dev/null +++ b/test/CodeGen/X86/phaddsub.ll @@ -0,0 +1,170 @@ +; RUN: llc < %s -march=x86-64 -mattr=+ssse3,-avx | FileCheck %s -check-prefix=SSSE3 +; RUN: llc < %s -march=x86-64 -mattr=-ssse3,+avx | FileCheck %s -check-prefix=AVX + +; SSSE3: phaddw1: +; SSSE3-NOT: vphaddw +; SSSE3: phaddw +; AVX: phaddw1: +; AVX: vphaddw +define <8 x i16> @phaddw1(<8 x i16> %x, <8 x i16> %y) { + %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %r = add <8 x i16> %a, %b + ret <8 x i16> %r +} + +; SSSE3: phaddw2: +; SSSE3-NOT: vphaddw +; SSSE3: phaddw +; AVX: phaddw2: +; AVX: vphaddw +define <8 x i16> @phaddw2(<8 x i16> %x, <8 x i16> %y) { + %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 2, i32 5, i32 6, i32 9, i32 10, i32 13, i32 14> + %b = shufflevector <8 x i16> %y, <8 x i16> %x, <8 x i32> <i32 8, i32 11, i32 12, i32 15, i32 0, i32 3, 
i32 4, i32 7> + %r = add <8 x i16> %a, %b + ret <8 x i16> %r +} + +; SSSE3: phaddd1: +; SSSE3-NOT: vphaddd +; SSSE3: phaddd +; AVX: phaddd1: +; AVX: vphaddd +define <4 x i32> @phaddd1(<4 x i32> %x, <4 x i32> %y) { + %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %r = add <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phaddd2: +; SSSE3-NOT: vphaddd +; SSSE3: phaddd +; AVX: phaddd2: +; AVX: vphaddd +define <4 x i32> @phaddd2(<4 x i32> %x, <4 x i32> %y) { + %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 2, i32 5, i32 6> + %b = shufflevector <4 x i32> %y, <4 x i32> %x, <4 x i32> <i32 4, i32 7, i32 0, i32 3> + %r = add <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phaddd3: +; SSSE3-NOT: vphaddd +; SSSE3: phaddd +; AVX: phaddd3: +; AVX: vphaddd +define <4 x i32> @phaddd3(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7> + %r = add <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phaddd4: +; SSSE3-NOT: vphaddd +; SSSE3: phaddd +; AVX: phaddd4: +; AVX: vphaddd +define <4 x i32> @phaddd4(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef> + %r = add <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phaddd5: +; SSSE3-NOT: vphaddd +; SSSE3: phaddd +; AVX: phaddd5: +; AVX: vphaddd +define <4 x i32> @phaddd5(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 3, i32 undef, i32 undef> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 2, i32 undef, i32 undef> + %r = add <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phaddd6: +; SSSE3-NOT: vphaddd +; SSSE3: phaddd +; AVX: phaddd6: +; AVX: vphaddd +define <4 x i32> @phaddd6(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %r = add <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phaddd7: +; SSSE3-NOT: vphaddd +; SSSE3: phaddd +; AVX: phaddd7: +; AVX: vphaddd +define <4 x i32> @phaddd7(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 undef, i32 undef> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 undef, i32 undef> + %r = add <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phsubw1: +; SSSE3-NOT: vphsubw +; SSSE3: phsubw +; AVX: phsubw1: +; AVX: vphsubw +define <8 x i16> @phsubw1(<8 x i16> %x, <8 x i16> %y) { + %a = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 0, i32 2, i32 4, i32 6, i32 8, i32 10, i32 12, i32 14> + %b = shufflevector <8 x i16> %x, <8 x i16> %y, <8 x i32> <i32 1, i32 3, i32 5, i32 7, i32 9, i32 11, i32 13, i32 15> + %r = sub <8 x i16> %a, %b + ret <8 x i16> %r +} + +; SSSE3: phsubd1: +; SSSE3-NOT: vphsubd +; SSSE3: phsubd +; AVX: phsubd1: +; AVX: vphsubd +define <4 x i32> @phsubd1(<4 x i32> %x, <4 x i32> %y) { + %a = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 0, i32 2, i32 4, i32 6> + %b = shufflevector <4 x i32> %x, <4 x i32> %y, <4 x i32> <i32 1, i32 3, i32 5, i32 7> + %r = sub <4 x i32> %a, %b + ret <4 x i32> %r 
+} + +; SSSE3: phsubd2: +; SSSE3-NOT: vphsubd +; SSSE3: phsubd +; AVX: phsubd2: +; AVX: vphsubd +define <4 x i32> @phsubd2(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 2, i32 4, i32 6> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 undef, i32 3, i32 5, i32 7> + %r = sub <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phsubd3: +; SSSE3-NOT: vphsubd +; SSSE3: phsubd +; AVX: phsubd3: +; AVX: vphsubd +define <4 x i32> @phsubd3(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 2, i32 undef, i32 undef> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 3, i32 undef, i32 undef> + %r = sub <4 x i32> %a, %b + ret <4 x i32> %r +} + +; SSSE3: phsubd4: +; SSSE3-NOT: vphsubd +; SSSE3: phsubd +; AVX: phsubd4: +; AVX: vphsubd +define <4 x i32> @phsubd4(<4 x i32> %x) { + %a = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 0, i32 undef, i32 undef, i32 undef> + %b = shufflevector <4 x i32> %x, <4 x i32> undef, <4 x i32> <i32 1, i32 undef, i32 undef, i32 undef> + %r = sub <4 x i32> %a, %b + ret <4 x i32> %r +} diff --git a/test/CodeGen/X86/pr11202.ll b/test/CodeGen/X86/pr11202.ll new file mode 100644 index 0000000..2b26a69 --- /dev/null +++ b/test/CodeGen/X86/pr11202.ll @@ -0,0 +1,19 @@ +; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s + +@bb = constant [1 x i8*] [i8* blockaddress(@main, %l2)] + +define void @main() { +entry: + br label %l1 + +l1: ; preds = %l2, %entry + %a = zext i1 false to i32 + br label %l2 + +l2: ; preds = %l1 + %b = zext i1 false to i32 + br label %l1 +} + +; CHECK: .Ltmp1: # Address of block that was removed by CodeGen +; CHECK: .quad .Ltmp1 diff --git a/test/CodeGen/X86/pr11415.ll b/test/CodeGen/X86/pr11415.ll new file mode 100644 index 0000000..e1fa032 --- /dev/null +++ b/test/CodeGen/X86/pr11415.ll @@ -0,0 +1,23 @@ +; RUN: llc -mtriple=x86_64-pc-linux %s -o - -regalloc=fast | FileCheck %s + +; We used to consider the early clobber in the second asm statement as +; defining %0 before it was read. 
This caused us to omit the +; movq -8(%rsp), %rdx + +; CHECK: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movq %rcx, %rax +; CHECK-NEXT: movq %rax, -8(%rsp) +; CHECK-NEXT: movq -8(%rsp), %rdx +; CHECK-NEXT: #APP +; CHECK-NEXT: #NO_APP +; CHECK-NEXT: movq %rdx, %rax +; CHECK-NEXT: movq %rdx, -8(%rsp) +; CHECK-NEXT: ret + +define i64 @foo() { +entry: + %0 = tail call i64 asm "", "={cx}"() nounwind + %1 = tail call i64 asm "", "=&r,0,r,~{rax}"(i64 %0, i64 %0) nounwind + ret i64 %1 +} diff --git a/test/CodeGen/X86/splat-scalar-load.ll b/test/CodeGen/X86/splat-scalar-load.ll index 2b13029..81a072f 100644 --- a/test/CodeGen/X86/splat-scalar-load.ll +++ b/test/CodeGen/X86/splat-scalar-load.ll @@ -1,7 +1,7 @@ ; RUN: llc < %s -mtriple=i386-apple-darwin -mattr=+sse2 | FileCheck %s ; rdar://7434544 -define <2 x i64> @t2() nounwind ssp { +define <2 x i64> @t2() nounwind { entry: ; CHECK: t2: ; CHECK: pshufd $85, (%esp), %xmm0 diff --git a/test/CodeGen/X86/vec_extract-sse4.ll b/test/CodeGen/X86/vec_extract-sse4.ll index f487654..42d7f27 100644 --- a/test/CodeGen/X86/vec_extract-sse4.ll +++ b/test/CodeGen/X86/vec_extract-sse4.ll @@ -2,7 +2,7 @@ ; RUN: not grep extractps %t ; RUN: not grep pextrd %t ; RUN: not grep pshufd %t -; RUN: grep movss %t | count 2 +; RUN: not grep movss %t define void @t1(float* %R, <4 x float>* %P1) nounwind { %X = load <4 x float>* %P1 diff --git a/test/CodeGen/X86/vec_shuffle-39.ll b/test/CodeGen/X86/vec_shuffle-39.ll index 64508b5..55531e3 100644 --- a/test/CodeGen/X86/vec_shuffle-39.ll +++ b/test/CodeGen/X86/vec_shuffle-39.ll @@ -67,3 +67,20 @@ bb: %tmp7 = shufflevector <2 x double> %tmp5, <2 x double> %tmp6, <2 x i32> <i32 2, i32 1> ret <2 x double> %tmp7 } + +; rdar://10450317 +define <2 x i64> @t4() nounwind readonly { +bb: +; CHECK: t4: +; CHECK: punpcklqdq %xmm0, %xmm1 +; CHECK: movq (%rax), %xmm0 +; CHECK: movsd %xmm1, %xmm0 + %tmp0 = load i128* null, align 1 + %tmp1 = load <2 x i32>* undef, align 8 + %tmp2 = bitcast i128 %tmp0 to <16 x i8> + %tmp3 = bitcast <2 x i32> %tmp1 to i64 + %tmp4 = insertelement <2 x i64> undef, i64 %tmp3, i32 0 + %tmp5 = bitcast <16 x i8> %tmp2 to <2 x i64> + %tmp6 = shufflevector <2 x i64> %tmp4, <2 x i64> %tmp5, <2 x i32> <i32 2, i32 1> + ret <2 x i64> %tmp6 +} diff --git a/test/CodeGen/X86/widen_shuffle-1.ll b/test/CodeGen/X86/widen_shuffle-1.ll index 2df3b6a..24608d0 100644 --- a/test/CodeGen/X86/widen_shuffle-1.ll +++ b/test/CodeGen/X86/widen_shuffle-1.ll @@ -54,3 +54,11 @@ define <8 x i8> @shuf4(<4 x i8> %a, <4 x i8> %b) nounwind readnone { %vshuf = shufflevector <4 x i8> %a, <4 x i8> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7> ret <8 x i8> %vshuf } + +; PR11389: another CONCAT_VECTORS case +define void @shuf5(<8 x i8>* %p) nounwind { +; CHECK: shuf5: + %v = shufflevector <2 x i8> <i8 4, i8 33>, <2 x i8> undef, <8 x i32> <i32 1, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef> + store <8 x i8> %v, <8 x i8>* %p, align 8 + ret void +} diff --git a/test/Instrumentation/AddressSanitizer/bug_11395.ll b/test/Instrumentation/AddressSanitizer/bug_11395.ll new file mode 100644 index 0000000..c53c385 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/bug_11395.ll @@ -0,0 +1,71 @@ +; RUN: opt < %s -asan -S | llc -o /dev/null +; The bug manifests as a reg alloc failure: +; error: ran out of registers during register allocation +; ModuleID = 'z.o' +target datalayout = 
"e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32-S128" +target triple = "i386-unknown-linux-gnu" + +%struct.DSPContext = type { void (i16*, i8*, i32)*, void (i16*, i8*, i8*, i32)*, void (i16*, i8*, i32)*, void (i16*, i8*, i32)*, void (i16*, i8*, i32)*, void (i8*, i16*, i32)*, void (i8*, i16*, i32)*, i32 (i16*)*, void (i8*, i8*, i32, i32, i32, i32, i32)*, void (i8*, i8*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32)*, void (i16*)*, void (i16*)*, i32 (i8*, i32)*, i32 (i8*, i32)*, [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], [6 x i32 (i8*, i8*, i8*, i32, i32)*], i32 (i8*, i16*, i32)*, [4 x [4 x void (i8*, i8*, i32, i32)*]], [4 x [4 x void (i8*, i8*, i32, i32)*]], [4 x [4 x void (i8*, i8*, i32, i32)*]], [4 x [4 x void (i8*, i8*, i32, i32)*]], [2 x void (i8*, i8*, i8*, i32, i32)*], [11 x void (i8*, i8*, i32, i32, i32)*], [11 x void (i8*, i8*, i32, i32, i32)*], [2 x [16 x void (i8*, i8*, i32)*]], [2 x [16 x void (i8*, i8*, i32)*]], [2 x [16 x void (i8*, i8*, i32)*]], [2 x [16 x void (i8*, i8*, i32)*]], [8 x void (i8*, i8*, i32)*], [3 x void (i8*, i8*, i32, i32, i32, i32)*], [3 x void (i8*, i8*, i32, i32, i32, i32)*], [3 x void (i8*, i8*, i32, i32, i32, i32)*], [3 x void (i8*, i8*, i32, i32, i32, i32)*], [4 x [16 x void (i8*, i8*, i32)*]], [4 x [16 x void (i8*, i8*, i32)*]], [4 x [16 x void (i8*, i8*, i32)*]], [4 x [16 x void (i8*, i8*, i32)*]], [10 x void (i8*, i32, i32, i32, i32)*], [10 x void (i8*, i8*, i32, i32, i32, i32, i32)*], [2 x [16 x void (i8*, i8*, i32)*]], [2 x [16 x void (i8*, i8*, i32)*]], void (i8*, i32, i32, i32, i32, i32, i32)*, void (i8*, i32, i32, i32, i32, i32, i32)*, void (i8*, i32, i32, i32, i32, i32, i32)*, void (i8*, i32, i32, i32, i32, i32, i32)*, void (i8*, i16*, i32)*, [2 x [4 x i32 (i8*, i8*, i8*, i32, i32)*]], void (i8*, i8*, i32)*, void (i8*, i8*, i8*, i32)*, void (i8*, i8*, i8*, i32)*, void (i8*, i8*, i8*, i32, i32*, i32*)*, void (i8*, i8*, i8*, i32, i32*, i32*)*, i32 (i8*, i8*, i32, i32)*, void (i8*, i8*, i32, i32*, i32*, i32*)*, void (i8*, i8*, i8*, i32, i32)*, void (i32*, i32*, i32)*, void (i8*, i32, i32, i32, i8*)*, void (i8*, i32, i32, i32, i8*)*, void (i8*, i32, i32, i32)*, void (i8*, i32, i32, i32)*, void (i8*, i32, i32, i32, i8*)*, void (i8*, i32, i32, i32, i8*)*, void (i8*, i32, i32, i32)*, void (i8*, i32, i32, i32)*, void ([4 x [4 x i16]]*, i8*, [40 x i8]*, [40 x [2 x i16]]*, i32, i32, i32, i32, i32, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32)*, void (i8*, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32*)*, void (i8*, i32, i32*)*, void (i8*, i8*, i32, i16*, i16*)*, void (float*, float*, i32)*, void ([256 x float]*, [2 x float]*, i32, i32, i32)*, void (i32*, i32, i32, double*)*, void (float*, float*, i32)*, void (float*, float*, float*, i32)*, void (float*, float*, 
float*, float*, i32)*, void (float*, float*, float*, float*, float, i32)*, void (float*, i32*, float, i32)*, void (float*, float*, float, float, i32)*, void (float*, float*, float, i32)*, [2 x void (float*, float*, float**, float, i32)*], [2 x void (float*, float**, float, i32)*], float (float*, float*, i32)*, void (float*, float*, i32)*, void (i16*, float*, i32)*, void (i16*, float**, i32, i32)*, void (i16*)*, void (i16*)*, void (i16*)*, void (i8*, i32, i16*)*, void (i8*, i32, i16*)*, [64 x i8], i32, i32 (i16*, i16*, i16*, i32)*, void (i16*, i16*, i32)*, void (i8*, i32, i32, i32, i32)*, void (i8*, i16*, i32)*, void (i8*, i16*, i32)*, void (i8*, i16*, i32)*, void (i8*, i16*, i32)*, void ([4 x i16]*)*, void (i8*, i32*, i16*, i32, i8*)*, void (i8*, i32*, i16*, i32, i8*)*, void (i8**, i32*, i16*, i32, i8*)*, void (i8*, i32*, i16*, i32, i8*)*, void (i16*, i16*, i16*, i16*, i16*, i16*, i32)*, void (i16*, i32)*, void (i8*, i32, i8**, i32, i32, i32, i32, i32, %struct.slice_buffer_s*, i32, i8*)*, void (i8*, i32, i32)*, [4 x void (i8*, i32, i8*, i32, i32, i32)*], void (i32*, i32*, i32, i32, i32, i32, i32, i32*)*, void (i16*)*, void (i8*, i32, i16*)*, void (i8*, i32, i16*)*, void (i8*, i32, i16*)*, void (i8*, i32, i16*)*, void (i8*, i32, i16*)*, void (i8*, i32, i16*)*, void (i8*, i32, i16*)*, void (i8*, i32)*, void (i8*, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32)*, void (i8*, i32, i32)*, [16 x void (i8*, i8*, i32, i32)*], [16 x void (i8*, i8*, i32, i32)*], [12 x void (i8*, i8*, i32)*], void (i8*, i8*, i32, i32*, i32*, i32)*, void (i16*, i16*, i32)*, void (i16*, i16*, i32)*, i32 (i16*, i16*, i32, i32)*, [4 x [16 x void (i8*, i8*, i32)*]], [4 x [16 x void (i8*, i8*, i32)*]], [4 x [16 x void (i8*, i8*, i32)*]], [4 x [16 x void (i8*, i8*, i32)*]], [3 x void (i8*, i8*, i32, i32, i32, i32)*], [3 x void (i8*, i8*, i32, i32, i32, i32)*] } +%struct.slice_buffer_s = type opaque +%struct.AVCodecContext = type { %struct.AVClass*, i32, i32, i32, i32, i32, i8*, i32, %struct.AVRational, i32, i32, i32, i32, i32, void (%struct.AVCodecContext*, %struct.AVFrame*, i32*, i32, i32, i32)*, i32, i32, i32, i32, i32, i32, i32, float, float, i32, i32, i32, i32, float, i32, i32, i32, %struct.AVCodec*, i8*, i32, void (%struct.AVCodecContext*, i8*, i32, i32)*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8*, [32 x i8], i32, i32, i32, i32, i32, i32, i32, float, i32, i32 (%struct.AVCodecContext*, %struct.AVFrame*)*, void (%struct.AVCodecContext*, %struct.AVFrame*)*, i32, i32, i32, i32, i8*, i8*, float, float, i32, %struct.RcOverride*, i32, i8*, i32, i32, i32, float, float, float, float, i32, float, float, float, float, float, i32, i32, i32*, i32, i32, i32, i32, %struct.AVRational, %struct.AVFrame*, i32, i32, [4 x i64], i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 (%struct.AVCodecContext*, i32*)*, i32, i32, i32, i32, i32, i32, i8*, i32, i32, i32, i32, i32, i32, i16*, i16*, i32, i32, i32, i32, %struct.AVPaletteControl*, i32, i32 (%struct.AVCodecContext*, %struct.AVFrame*)*, i32, i32, i32, i32, i32, i32, i32, i32 (%struct.AVCodecContext*, i32 (%struct.AVCodecContext*, i8*)*, i8*, i32*, i32, i32)*, i8*, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, float, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i64, i32, float, i64, i32, i64, i64, float, float, %struct.AVHWAccel*, i32, i8*, i32, i32, i32, 
i32, i32, i32 (%struct.AVCodecContext*, i32 (%struct.AVCodecContext*, i8*, i32, i32)*, i8*, i32*, i32)*, i32, i32, i32, i32, i32, i32, i8*, float, float, float, float, i32, i32, i32, float, float, float, i32, i32, i32, i32, [4 x i32], i8*, i32, i32, i32, i32 } +%struct.AVClass = type { i8*, i8* (i8*)*, %struct.AVOption* } +%struct.AVOption = type opaque +%struct.AVRational = type { i32, i32 } +%struct.AVFrame = type { [4 x i8*], [4 x i32], [4 x i8*], i32, i32, i64, i32, i32, i32, i32, i32, i8*, i32, i8*, [2 x [2 x i16]*], i32*, i8, i8*, [4 x i64], i32, i32, i32, i32, i32, %struct.AVPanScan*, i32, i32, i16*, [2 x i8*], i64, i8* } +%struct.AVPanScan = type { i32, i32, i32, [3 x [2 x i16]] } +%struct.AVCodec = type { i8*, i32, i32, i32, i32 (%struct.AVCodecContext*)*, i32 (%struct.AVCodecContext*, i8*, i32, i8*)*, i32 (%struct.AVCodecContext*)*, i32 (%struct.AVCodecContext*, i8*, i32*, %struct.AVPacket*)*, i32, %struct.AVCodec*, void (%struct.AVCodecContext*)*, %struct.AVRational*, i32*, i8*, i32*, i32*, i64* } +%struct.AVPacket = type { i64, i64, i8*, i32, i32, i32, i32, void (%struct.AVPacket*)*, i8*, i64, i64 } +%struct.RcOverride = type { i32, i32, i32, float } +%struct.AVPaletteControl = type { i32, [256 x i32] } +%struct.AVHWAccel = type { i8*, i32, i32, i32, i32, %struct.AVHWAccel*, i32 (%struct.AVCodecContext*, i8*, i32)*, i32 (%struct.AVCodecContext*, i8*, i32)*, i32 (%struct.AVCodecContext*)*, i32 } + +@firtable = internal unnamed_addr constant [9 x i8*] [i8* @ff_mlp_firorder_0, i8* @ff_mlp_firorder_1, i8* @ff_mlp_firorder_2, i8* @ff_mlp_firorder_3, i8* @ff_mlp_firorder_4, i8* @ff_mlp_firorder_5, i8* @ff_mlp_firorder_6, i8* @ff_mlp_firorder_7, i8* @ff_mlp_firorder_8], align 4 +@iirtable = internal unnamed_addr constant [5 x i8*] [i8* @ff_mlp_iirorder_0, i8* @ff_mlp_iirorder_1, i8* @ff_mlp_iirorder_2, i8* @ff_mlp_iirorder_3, i8* @ff_mlp_iirorder_4], align 4 +@ff_mlp_iirorder_0 = external global i8 +@ff_mlp_iirorder_1 = external global i8 +@ff_mlp_iirorder_2 = external global i8 +@ff_mlp_iirorder_3 = external global i8 +@ff_mlp_iirorder_4 = external global i8 +@ff_mlp_firorder_0 = external global i8 +@ff_mlp_firorder_1 = external global i8 +@ff_mlp_firorder_2 = external global i8 +@ff_mlp_firorder_3 = external global i8 +@ff_mlp_firorder_4 = external global i8 +@ff_mlp_firorder_5 = external global i8 +@ff_mlp_firorder_6 = external global i8 +@ff_mlp_firorder_7 = external global i8 +@ff_mlp_firorder_8 = external global i8 + +define void @ff_mlp_init_x86(%struct.DSPContext* nocapture %c, %struct.AVCodecContext* nocapture %avctx) nounwind { +entry: + %mlp_filter_channel = getelementptr inbounds %struct.DSPContext* %c, i32 0, i32 131 + store void (i32*, i32*, i32, i32, i32, i32, i32, i32*)* @mlp_filter_channel_x86, void (i32*, i32*, i32, i32, i32, i32, i32, i32*)** %mlp_filter_channel, align 4, !tbaa !0 + ret void +} + +define internal void @mlp_filter_channel_x86(i32* %state, i32* %coeff, i32 %firorder, i32 %iirorder, i32 %filter_shift, i32 %mask, i32 %blocksize, i32* %sample_buffer) nounwind { +entry: + %filter_shift.addr = alloca i32, align 4 + %mask.addr = alloca i32, align 4 + %blocksize.addr = alloca i32, align 4 + %firjump = alloca i8*, align 4 + %iirjump = alloca i8*, align 4 + store i32 %filter_shift, i32* %filter_shift.addr, align 4, !tbaa !3 + store i32 %mask, i32* %mask.addr, align 4, !tbaa !3 + %arrayidx = getelementptr inbounds [9 x i8*]* @firtable, i32 0, i32 %firorder + %0 = load i8** %arrayidx, align 4, !tbaa !0 + store i8* %0, i8** %firjump, align 4, !tbaa !0 + 
%arrayidx1 = getelementptr inbounds [5 x i8*]* @iirtable, i32 0, i32 %iirorder + %1 = load i8** %arrayidx1, align 4, !tbaa !0 + store i8* %1, i8** %iirjump, align 4, !tbaa !0 + %sub = sub nsw i32 0, %blocksize + store i32 %sub, i32* %blocksize.addr, align 4, !tbaa !3 + %2 = call { i32*, i32*, i32* } asm sideeffect "1: \0A\09xor %esi, %esi\0A\09xor %ecx, %ecx\0A\09jmp *$5 \0A\09ff_mlp_firorder_8: \0A\09mov 0x1c+0($0), %eax\0A\09imull 0x1c+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_7: \0A\09mov 0x18+0($0), %eax\0A\09imull 0x18+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_6: \0A\09mov 0x14+0($0), %eax\0A\09imull 0x14+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_5: \0A\09mov 0x10+0($0), %eax\0A\09imull 0x10+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_4: \0A\09mov 0x0c+0($0), %eax\0A\09imull 0x0c+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_3: \0A\09mov 0x08+0($0), %eax\0A\09imull 0x08+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_2: \0A\09mov 0x04+0($0), %eax\0A\09imull 0x04+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_1: \0A\09mov 0x00+0($0), %eax\0A\09imull 0x00+0($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_firorder_0:\0A\09jmp *$6 \0A\09ff_mlp_iirorder_4: \0A\09mov 0x0c+4*(8 + (40 * 4))($0), %eax\0A\09imull 0x0c+4* 8($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_iirorder_3: \0A\09mov 0x08+4*(8 + (40 * 4))($0), %eax\0A\09imull 0x08+4* 8($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_iirorder_2: \0A\09mov 0x04+4*(8 + (40 * 4))($0), %eax\0A\09imull 0x04+4* 8($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_iirorder_1: \0A\09mov 0x00+4*(8 + (40 * 4))($0), %eax\0A\09imull 0x00+4* 8($1) \0A\09add %eax , %esi\0A\09adc %edx , %ecx\0A\09ff_mlp_iirorder_0:\0A\09mov %ecx, %edx\0A\09mov %esi, %eax\0A\09movzbl $7 , %ecx\0A\09shrd %cl, %edx, %eax\0A\09mov %eax ,%edx \0A\09add ($2) ,%eax \0A\09and $4 ,%eax \0A\09sub $$4 , $0 \0A\09mov %eax, ($0) \0A\09mov %eax, ($2) \0A\09add $$4* 8 , $2 \0A\09sub %edx ,%eax \0A\09mov %eax,4*(8 + (40 * 4))($0) \0A\09incl $3 \0A\09js 1b \0A\09", "=r,=r,=r,=*m,*m,*m,*m,*m,0,1,2,*m,~{eax},~{edx},~{esi},~{ecx},~{dirflag},~{fpsr},~{flags}"(i32* %blocksize.addr, i32* %mask.addr, i8** %firjump, i8** %iirjump, i32* %filter_shift.addr, i32* %state, i32* %coeff, i32* %sample_buffer, i32* %blocksize.addr) nounwind, !srcloc !4 + ret void +} + +!0 = metadata !{metadata !"any pointer", metadata !1} +!1 = metadata !{metadata !"omnipotent char", metadata !2} +!2 = metadata !{metadata !"Simple C/C++ TBAA", null} +!3 = metadata !{metadata !"int", metadata !1} +!4 = metadata !{i32 156132, i32 156164, i32 156205, i32 156238, i32 156282, i32 156332, i32 156370, i32 156408, i32 156447, i32 156486, i32 156536, i32 156574, i32 156612, i32 156651, i32 156690, i32 156740, i32 156778, i32 156816, i32 156855, i32 156894, i32 156944, i32 156982, i32 157020, i32 157059, i32 157098, i32 157148, i32 157186, i32 157224, i32 157263, i32 157302, i32 157352, i32 157390, i32 157428, i32 157467, i32 157506, i32 157556, i32 157594, i32 157632, i32 157671, i32 157710, i32 157760, i32 157798, i32 157836, i32 157875, i32 157914, i32 157952, i32 157996, i32 158046, i32 158099, i32 158140, i32 158179, i32 158218, i32 158268, i32 158321, i32 158362, i32 158401, i32 158440, i32 158490, i32 158543, i32 158584, i32 158623, i32 158662, i32 158712, i32 158765, i32 158806, i32 158845, i32 158884, 
i32 158922, i32 158963, i32 158996, i32 159029, i32 159062, i32 159109, i32 159154, i32 159199, i32 159243, i32 159286, i32 159329, i32 159375, i32 159422, i32 159478, i32 159522, i32 159566} diff --git a/test/Instrumentation/AddressSanitizer/dg.exp b/test/Instrumentation/AddressSanitizer/dg.exp new file mode 100644 index 0000000..f200589 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/dg.exp @@ -0,0 +1,3 @@ +load_lib llvm.exp + +RunLLVMTests [lsort [glob -nocomplain $srcdir/$subdir/*.{ll,c,cpp}]] diff --git a/test/Instrumentation/AddressSanitizer/do-not-touch-odr-global.ll b/test/Instrumentation/AddressSanitizer/do-not-touch-odr-global.ll new file mode 100644 index 0000000..1687877 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/do-not-touch-odr-global.ll @@ -0,0 +1,6 @@ +; RUN: opt < %s -asan -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" +; no action should be taken for these globals +@v1 = linkonce_odr constant i8 1 +; CHECK-NOT: __asan_register_globals diff --git a/test/Instrumentation/AddressSanitizer/do-not-touch-threadlocal.ll b/test/Instrumentation/AddressSanitizer/do-not-touch-threadlocal.ll new file mode 100644 index 0000000..89644d4 --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/do-not-touch-threadlocal.ll @@ -0,0 +1,6 @@ +; RUN: opt < %s -asan -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" +; no action should be taken for thread locals +@xxx = thread_local global i32 0, align 4 +; CHECK-NOT: __asan_register_globals diff --git a/test/Instrumentation/AddressSanitizer/test64.ll b/test/Instrumentation/AddressSanitizer/test64.ll new file mode 100644 index 0000000..e26fb3d --- /dev/null +++ b/test/Instrumentation/AddressSanitizer/test64.ll @@ -0,0 +1,14 @@ +; RUN: opt < %s -asan -S | FileCheck %s +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" +target triple = "x86_64-unknown-linux-gnu" +define i32 @read_4_bytes(i32* %a) { +entry: + %tmp1 = load i32* %a, align 4 + ret i32 %tmp1 +} +; CHECK: @read_4_bytes +; CHECK-NOT: ret +; CHECK: lshr {{.*}} 3 +; Check for ASAN's Offset for 64-bit (2^44) +; CHECK-NEXT: 17592186044416 +; CHECK: ret diff --git a/test/MC/ARM/basic-arm-instructions.s b/test/MC/ARM/basic-arm-instructions.s index 22ad3cd..133967b 100644 --- a/test/MC/ARM/basic-arm-instructions.s +++ b/test/MC/ARM/basic-arm-instructions.s @@ -133,9 +133,9 @@ Lforward: adr r2, #-3 @ CHECK: Lback: -@ CHECK: adr r2, Lback @ encoding: [0bAAAAAAA0,0x20'A',0x0f'A',0b1110001A] +@ CHECK: adr r2, Lback @ encoding: [A,0x20'A',0x0f'A',0xe2'A'] @ CHECK: @ fixup A - offset: 0, value: Lback, kind: fixup_arm_adr_pcrel_12 -@ CHECK: adr r3, Lforward @ encoding: [0bAAAAAAA0,0x30'A',0x0f'A',0b1110001A] +@ CHECK: adr r3, Lforward @ encoding: [A,0x30'A',0x0f'A',0xe2'A'] @ CHECK: @ fixup A - offset: 0, value: Lforward, kind: fixup_arm_adr_pcrel_12 @ CHECK: Lforward: @ CHECK: adr r2, #3 @ encoding: [0x03,0x20,0x8f,0xe2] @@ -262,10 +262,12 @@ Lforward: asr r2, r4, #32 asr r2, r4, #2 asr r2, r4, #0 + asr r4, #2 @ CHECK: asr r2, r4, #32 @ encoding: [0x44,0x20,0xa0,0xe1] @ CHECK: asr r2, r4, #2 @ encoding: 
[0x44,0x21,0xa0,0xe1] @ CHECK: mov r2, r4 @ encoding: [0x04,0x20,0xa0,0xe1] +@ CHECK: asr r4, r4, #2 @ encoding: [0x44,0x41,0xa0,0xe1] @------------------------------------------------------------------------------ @@ -794,10 +796,12 @@ Lforward: lsl r2, r4, #31 lsl r2, r4, #1 lsl r2, r4, #0 + lsl r4, #1 @ CHECK: lsl r2, r4, #31 @ encoding: [0x84,0x2f,0xa0,0xe1] @ CHECK: lsl r2, r4, #1 @ encoding: [0x84,0x20,0xa0,0xe1] @ CHECK: mov r2, r4 @ encoding: [0x04,0x20,0xa0,0xe1] +@ CHECK: lsl r4, r4, #1 @ encoding: [0x84,0x40,0xa0,0xe1] @------------------------------------------------------------------------------ @@ -806,10 +810,12 @@ Lforward: lsr r2, r4, #32 lsr r2, r4, #2 lsr r2, r4, #0 + lsr r4, #2 @ CHECK: lsr r2, r4, #32 @ encoding: [0x24,0x20,0xa0,0xe1] @ CHECK: lsr r2, r4, #2 @ encoding: [0x24,0x21,0xa0,0xe1] @ CHECK: mov r2, r4 @ encoding: [0x04,0x20,0xa0,0xe1] +@ CHECK: lsr r4, r4, #2 @ encoding: [0x24,0x41,0xa0,0xe1] @------------------------------------------------------------------------------ @@ -1001,11 +1007,13 @@ Lforward: muls r5, r6, r7 mulgt r5, r6, r7 mulsle r5, r6, r7 + mul r11, r5 @ CHECK: mul r5, r6, r7 @ encoding: [0x96,0x07,0x05,0xe0] @ CHECK: muls r5, r6, r7 @ encoding: [0x96,0x07,0x15,0xe0] @ CHECK: mulgt r5, r6, r7 @ encoding: [0x96,0x07,0x05,0xc0] @ CHECK: mulsle r5, r6, r7 @ encoding: [0x96,0x07,0x15,0xd0] +@ CHECK: mul r11, r11, r5 @ encoding: [0x9b,0x05,0x0b,0xe0] @------------------------------------------------------------------------------ @@ -1344,10 +1352,12 @@ Lforward: ror r2, r4, #31 ror r2, r4, #1 ror r2, r4, #0 + ror r4, #1 @ CHECK: ror r2, r4, #31 @ encoding: [0xe4,0x2f,0xa0,0xe1] @ CHECK: ror r2, r4, #1 @ encoding: [0xe4,0x20,0xa0,0xe1] @ CHECK: mov r2, r4 @ encoding: [0x04,0x20,0xa0,0xe1] +@ CHECK: ror r4, r4, #1 @ encoding: [0xe4,0x40,0xa0,0xe1] @------------------------------------------------------------------------------ diff --git a/test/MC/ARM/basic-thumb2-instructions.s b/test/MC/ARM/basic-thumb2-instructions.s index 74b0681..0dbde19 100644 --- a/test/MC/ARM/basic-thumb2-instructions.s +++ b/test/MC/ARM/basic-thumb2-instructions.s @@ -1228,12 +1228,18 @@ _func: mul r3, r4, r6 it eq muleq r3, r4, r5 + it le + mulle r4, r4, r8 + mul r6, r5 @ CHECK: muls r3, r4, r3 @ encoding: [0x63,0x43] @ CHECK: mul r3, r4, r3 @ encoding: [0x04,0xfb,0x03,0xf3] @ CHECK: mul r3, r4, r6 @ encoding: [0x04,0xfb,0x06,0xf3] @ CHECK: it eq @ encoding: [0x08,0xbf] @ CHECK: muleq r3, r4, r5 @ encoding: [0x04,0xfb,0x05,0xf3] +@ CHECK: it le @ encoding: [0xd8,0xbf] +@ CHECK: mulle r4, r4, r8 @ encoding: [0x04,0xfb,0x08,0xf4] +@ CHECK: mul r6, r6, r5 @ encoding: [0x06,0xfb,0x05,0xf6] @------------------------------------------------------------------------------ diff --git a/test/MC/ARM/neon-shuffle-encoding.s b/test/MC/ARM/neon-shuffle-encoding.s index ce7eb66..ed209f7 100644 --- a/test/MC/ARM/neon-shuffle-encoding.s +++ b/test/MC/ARM/neon-shuffle-encoding.s @@ -44,3 +44,63 @@ vzip.16 q9, q8 @ CHECK: vzip.32 q9, q8 @ encoding: [0xe0,0x21,0xfa,0xf3] vzip.32 q9, q8 + + +@ VTRN alternate size suffices + + vtrn.8 d3, d9 + vtrn.i8 d3, d9 + vtrn.u8 d3, d9 + vtrn.p8 d3, d9 + vtrn.16 d3, d9 + vtrn.i16 d3, d9 + vtrn.u16 d3, d9 + vtrn.p16 d3, d9 + vtrn.32 d3, d9 + vtrn.i32 d3, d9 + vtrn.u32 d3, d9 + vtrn.f32 d3, d9 + vtrn.f d3, d9 + + vtrn.8 q14, q6 + vtrn.i8 q14, q6 + vtrn.u8 q14, q6 + vtrn.p8 q14, q6 + vtrn.16 q14, q6 + vtrn.i16 q14, q6 + vtrn.u16 q14, q6 + vtrn.p16 q14, q6 + vtrn.32 q14, q6 + vtrn.i32 q14, q6 + vtrn.u32 q14, q6 + vtrn.f32 q14, q6 + vtrn.f q14, q6 + +@ CHECK: 
vtrn.8 d3, d9 @ encoding: [0x89,0x30,0xb2,0xf3] +@ CHECK: vtrn.8 d3, d9 @ encoding: [0x89,0x30,0xb2,0xf3] +@ CHECK: vtrn.8 d3, d9 @ encoding: [0x89,0x30,0xb2,0xf3] +@ CHECK: vtrn.8 d3, d9 @ encoding: [0x89,0x30,0xb2,0xf3] +@ CHECK: vtrn.16 d3, d9 @ encoding: [0x89,0x30,0xb6,0xf3] +@ CHECK: vtrn.16 d3, d9 @ encoding: [0x89,0x30,0xb6,0xf3] +@ CHECK: vtrn.16 d3, d9 @ encoding: [0x89,0x30,0xb6,0xf3] +@ CHECK: vtrn.16 d3, d9 @ encoding: [0x89,0x30,0xb6,0xf3] +@ CHECK: vtrn.32 d3, d9 @ encoding: [0x89,0x30,0xba,0xf3] +@ CHECK: vtrn.32 d3, d9 @ encoding: [0x89,0x30,0xba,0xf3] +@ CHECK: vtrn.32 d3, d9 @ encoding: [0x89,0x30,0xba,0xf3] +@ CHECK: vtrn.32 d3, d9 @ encoding: [0x89,0x30,0xba,0xf3] +@ CHECK: vtrn.32 d3, d9 @ encoding: [0x89,0x30,0xba,0xf3] + +@ CHECK: vtrn.8 q14, q6 @ encoding: [0xcc,0xc0,0xf2,0xf3] +@ CHECK: vtrn.8 q14, q6 @ encoding: [0xcc,0xc0,0xf2,0xf3] +@ CHECK: vtrn.8 q14, q6 @ encoding: [0xcc,0xc0,0xf2,0xf3] +@ CHECK: vtrn.8 q14, q6 @ encoding: [0xcc,0xc0,0xf2,0xf3] +@ CHECK: vtrn.16 q14, q6 @ encoding: [0xcc,0xc0,0xf6,0xf3] +@ CHECK: vtrn.16 q14, q6 @ encoding: [0xcc,0xc0,0xf6,0xf3] +@ CHECK: vtrn.16 q14, q6 @ encoding: [0xcc,0xc0,0xf6,0xf3] +@ CHECK: vtrn.16 q14, q6 @ encoding: [0xcc,0xc0,0xf6,0xf3] +@ CHECK: vtrn.32 q14, q6 @ encoding: [0xcc,0xc0,0xfa,0xf3] +@ CHECK: vtrn.32 q14, q6 @ encoding: [0xcc,0xc0,0xfa,0xf3] +@ CHECK: vtrn.32 q14, q6 @ encoding: [0xcc,0xc0,0xfa,0xf3] +@ CHECK: vtrn.32 q14, q6 @ encoding: [0xcc,0xc0,0xfa,0xf3] +@ CHECK: vtrn.32 q14, q6 @ encoding: [0xcc,0xc0,0xfa,0xf3] + diff --git a/test/MC/ARM/neon-vld-encoding.s b/test/MC/ARM/neon-vld-encoding.s index 03a3cea..503b1ec 100644 --- a/test/MC/ARM/neon-vld-encoding.s +++ b/test/MC/ARM/neon-vld-encoding.s @@ -251,3 +251,11 @@ @ CHECK: vld1.8 {d2, d3, d4}, [r2] @ encoding: [0x0f,0x26,0x22,0xf4] @ CHECK: vld1.32 {d2, d3, d4}, [r2] @ encoding: [0x8f,0x26,0x22,0xf4] @ CHECK: vld1.64 {d2, d3, d4}, [r2] @ encoding: [0xcf,0x26,0x22,0xf4] + + +@ Register lists can use the range syntax, just like VLDM + vld1.f64 {d2-d5}, [r2,:128]! + vld1.f64 {d2,d3,d4,d5}, [r2,:128]! + +@ CHECK: vld1.64 {d2, d3, d4, d5}, [r2, :128]! @ encoding: [0xed,0x22,0x22,0xf4] +@ CHECK: vld1.64 {d2, d3, d4, d5}, [r2, :128]! 
@ encoding: [0xed,0x22,0x22,0xf4] diff --git a/test/MC/ARM/nop-armv6t2-padding.s b/test/MC/ARM/nop-armv6t2-padding.s index 0e25718..c38ad2d 100644 --- a/test/MC/ARM/nop-armv6t2-padding.s +++ b/test/MC/ARM/nop-armv6t2-padding.s @@ -7,4 +7,4 @@ x: .align 4 add r0, r1, r2 -@ CHECK: ('_section_data', '020081e0 007820e3 007820e3 007820e3 020081e0') +@ CHECK: ('_section_data', '020081e0 00f020e3 00f020e3 00f020e3 020081e0') diff --git a/test/MC/ARM/prefetch.ll b/test/MC/ARM/prefetch.ll deleted file mode 100644 index e77fdb1..0000000 --- a/test/MC/ARM/prefetch.ll +++ /dev/null @@ -1,58 +0,0 @@ -; RUN: llc < %s -mtriple=armv7-apple-darwin -mattr=+v7,+mp -show-mc-encoding | FileCheck %s -check-prefix=ARM -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mattr=+v7 -show-mc-encoding | FileCheck %s -check-prefix=T2 -; rdar://8924681 - -define void @t1(i8* %ptr) nounwind { -entry: -; ARM: t1: -; ARM: pldw [r0] @ encoding: [0x00,0xf0,0x90,0xf5] -; ARM: pld [r0] @ encoding: [0x00,0xf0,0xd0,0xf5] - -; T2: t1: -; T2: pld [r0] @ encoding: [0x90,0xf8,0x00,0xf0] - tail call void @llvm.prefetch( i8* %ptr, i32 1, i32 3 ) - tail call void @llvm.prefetch( i8* %ptr, i32 0, i32 3 ) - ret void -} - -define void @t2(i8* %ptr) nounwind { -entry: -; ARM: t2: -; ARM: pld [r0, #1023] @ encoding: [0xff,0xf3,0xd0,0xf5] - -; T2: t2: -; T2: pld [r0, #1023] @ encoding: [0x90,0xf8,0xff,0xf3] - %tmp = getelementptr i8* %ptr, i32 1023 - tail call void @llvm.prefetch( i8* %tmp, i32 0, i32 3 ) - ret void -} - -define void @t3(i32 %base, i32 %offset) nounwind { -entry: -; ARM: t3: -; ARM: pld [r0, r1, lsr #2] @ encoding: [0x21,0xf1,0xd0,0xf7] - -; T2: t3: -; T2: pld [r0, r1] @ encoding: [0x10,0xf8,0x01,0xf0] - %tmp1 = lshr i32 %offset, 2 - %tmp2 = add i32 %base, %tmp1 - %tmp3 = inttoptr i32 %tmp2 to i8* - tail call void @llvm.prefetch( i8* %tmp3, i32 0, i32 3 ) - ret void -} - -define void @t4(i32 %base, i32 %offset) nounwind { -entry: -; ARM: t4: -; ARM: pld [r0, r1, lsl #2] @ encoding: [0x01,0xf1,0xd0,0xf7] - -; T2: t4: -; T2: pld [r0, r1, lsl #2] @ encoding: [0x10,0xf8,0x21,0xf0] - %tmp1 = shl i32 %offset, 2 - %tmp2 = add i32 %base, %tmp1 - %tmp3 = inttoptr i32 %tmp2 to i8* - tail call void @llvm.prefetch( i8* %tmp3, i32 0, i32 3 ) - ret void -} - -declare void @llvm.prefetch(i8*, i32, i32) nounwind diff --git a/test/MC/ARM/simple-fp-encoding.s b/test/MC/ARM/simple-fp-encoding.s index 5c81c78..58abe88 100644 --- a/test/MC/ARM/simple-fp-encoding.s +++ b/test/MC/ARM/simple-fp-encoding.s @@ -21,9 +21,15 @@ @ CHECK: vmul.f64 d16, d17, d16 @ encoding: [0xa0,0x0b,0x61,0xee] vmul.f64 d16, d17, d16 +@ CHECK: vmul.f64 d20, d20, d17 @ encoding: [0xa1,0x4b,0x64,0xee] + vmul.f64 d20, d17 + @ CHECK: vmul.f32 s0, s1, s0 @ encoding: [0x80,0x0a,0x20,0xee] vmul.f32 s0, s1, s0 +@ CHECK: vmul.f32 s11, s11, s21 @ encoding: [0xaa,0x5a,0x65,0xee] + vmul.f32 s11, s21 + @ CHECK: vnmul.f64 d16, d17, d16 @ encoding: [0xe0,0x0b,0x61,0xee] vnmul.f64 d16, d17, d16 @@ -127,6 +133,16 @@ vmovne s0, r0 vmoveq s0, r1 + vmov.f32 r1, s2 + vmov.f32 s4, r3 + vmov.f64 r1, r5, d2 + vmov.f64 d4, r3, r9 + +@ CHECK: vmov r1, s2 @ encoding: [0x10,0x1a,0x11,0xee] +@ CHECK: vmov s4, r3 @ encoding: [0x10,0x3a,0x02,0xee] +@ CHECK: vmov r1, r5, d2 @ encoding: [0x12,0x1b,0x55,0xec] +@ CHECK: vmov d4, r3, r9 @ encoding: [0x14,0x3b,0x49,0xec] + @ CHECK: vmrs r0, fpscr @ encoding: [0x10,0x0a,0xf1,0xee] vmrs r0, fpscr @ CHECK: vmrs r0, fpexc @ encoding: [0x10,0x0a,0xf8,0xee] diff --git a/test/MC/Disassembler/ARM/neon.txt b/test/MC/Disassembler/ARM/neon.txt index 73bcd37..e4346ec 
100644 --- a/test/MC/Disassembler/ARM/neon.txt +++ b/test/MC/Disassembler/ARM/neon.txt @@ -1863,3 +1863,9 @@ # CHECK: vld1.32 {d22, d23, d24, d25}, [pc, :64]! 0x9d 0xaa 0x41 0xf4 # CHECK: vst1.32 {d26, d27}, [r1, :64]! + +0x10 0x0f 0x83 0xf2 +0x50 0x0f 0x83 0xf2 +# CHECK: vmov.f32 d0, #1.600000e+01 +# CHECK: vmov.f32 q0, #1.600000e+01 + diff --git a/test/MC/X86/x86-64.s b/test/MC/X86/x86-64.s index 6c836fc..4ec579a 100644 --- a/test/MC/X86/x86-64.s +++ b/test/MC/X86/x86-64.s @@ -459,6 +459,7 @@ cwtl // CHECK: cwtl cbw // CHECK: cbtw cwd // CHECK: cwtd cdq // CHECK: cltd +cqo // CHECK: cqto // rdar://8456378 and PR7557 - fstsw fstsw %ax diff --git a/test/Makefile b/test/Makefile index 62b0973..1bf2874 100644 --- a/test/Makefile +++ b/test/Makefile @@ -121,11 +121,6 @@ DSYMUTIL=dsymutil else DSYMUTIL=true endif -ifdef TargetCommonOpts -BUGPOINT_TOPTS="-gcc-tool-args $(TargetCommonOpts)" -else -BUGPOINT_TOPTS="" -endif ifneq ($(OCAMLOPT),) CC_FOR_OCAMLOPT := $(shell $(OCAMLOPT) -config | grep native_c_compiler | sed -e 's/native_c_compiler: //') diff --git a/test/Transforms/ConstProp/bswap.ll b/test/Transforms/ConstProp/bswap.ll index 9fce309..a68fdcd 100644 --- a/test/Transforms/ConstProp/bswap.ll +++ b/test/Transforms/ConstProp/bswap.ll @@ -1,6 +1,6 @@ ; bswap should be constant folded when it is passed a constant argument -; RUN: opt < %s -constprop -S | not grep call +; RUN: opt < %s -constprop -S | FileCheck %s declare i16 @llvm.bswap.i16(i16) @@ -8,18 +8,34 @@ declare i32 @llvm.bswap.i32(i32) declare i64 @llvm.bswap.i64(i64) +declare i80 @llvm.bswap.i80(i80) + +; CHECK: define i16 @W define i16 @W() { + ; CHECK: ret i16 256 %Z = call i16 @llvm.bswap.i16( i16 1 ) ; <i16> [#uses=1] ret i16 %Z } +; CHECK: define i32 @X define i32 @X() { + ; CHECK: ret i32 16777216 %Z = call i32 @llvm.bswap.i32( i32 1 ) ; <i32> [#uses=1] ret i32 %Z } +; CHECK: define i64 @Y define i64 @Y() { + ; CHECK: ret i64 72057594037927936 %Z = call i64 @llvm.bswap.i64( i64 1 ) ; <i64> [#uses=1] ret i64 %Z } +; CHECK: define i80 @Z +define i80 @Z() { + ; CHECK: ret i80 -450681596205739728166896 + ; 0xA0908070605040302010 + %Z = call i80 @llvm.bswap.i80( i80 76151636403560493650080 ) + ; 0x102030405060708090A0 + ret i80 %Z +} diff --git a/test/Transforms/DeadStoreElimination/pr11390.ll b/test/Transforms/DeadStoreElimination/pr11390.ll new file mode 100644 index 0000000..2ce6eea --- /dev/null +++ b/test/Transforms/DeadStoreElimination/pr11390.ll @@ -0,0 +1,38 @@ +; RUN: opt -basicaa -dse -S -o - %s | FileCheck %s +; PR11390 +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +define fastcc void @cat_domain(i8* nocapture %name, i8* nocapture %domain, i8** +nocapture %s) nounwind uwtable { +entry: + %call = tail call i64 @strlen(i8* %name) nounwind readonly + %call1 = tail call i64 @strlen(i8* %domain) nounwind readonly + %add = add i64 %call, 1 + %add2 = add i64 %add, %call1 + %add3 = add i64 %add2, 1 + %call4 = tail call noalias i8* @malloc(i64 %add3) nounwind + store i8* %call4, i8** %s, align 8 + %tobool = icmp eq i8* %call4, null + br i1 %tobool, label %return, label %if.end + +if.end: ; preds = %entry + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %call4, i8* %name, i64 %call, i32 1, i1 false) + %arrayidx = getelementptr inbounds i8* %call4, i64 %call + store i8 46, i8* %arrayidx, align 1 +; CHECK: store i8 46 + %add.ptr5 = getelementptr inbounds i8* 
%call4, i64 %add + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %add.ptr5, i8* %domain, i64 %call1, i32 1, i1 false) + %arrayidx8 = getelementptr inbounds i8* %call4, i64 %add2 + store i8 0, i8* %arrayidx8, align 1 + br label %return + +return: ; preds = %if.end, %entry + ret void +} + +declare i64 @strlen(i8* nocapture) nounwind readonly + +declare noalias i8* @malloc(i64) nounwind + +declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind diff --git a/test/Transforms/GVN/rle.ll b/test/Transforms/GVN/rle.ll index 47ccd85..9e08004 100644 --- a/test/Transforms/GVN/rle.ll +++ b/test/Transforms/GVN/rle.ll @@ -26,6 +26,15 @@ define i8 @crash0({i32, i32} %A, {i32, i32}* %P) { ret i8 %Y } +;; No PR filed, crashed in CaptureTracker. +declare void @helper() +define void @crash1() { + tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* undef, i8* undef, i64 undef, i32 1, i1 false) nounwind + %tmp = load i8* bitcast (void ()* @helper to i8*) + %x = icmp eq i8 %tmp, 15 + ret void +} + ;;===----------------------------------------------------------------------===;; ;; Store -> Load and Load -> Load forwarding where src and dst are different diff --git a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll index 77354f7..a8b706e 100644 --- a/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll +++ b/test/Transforms/IndVarSimplify/2011-09-10-widen-nsw.ll @@ -3,6 +3,9 @@ ; add219 should be extended to i64 because it is nsw, even though its ; sext cannot be hoisted outside the loop. +; FIXME: GetExtendedOperandRecurrence has problems with the nsw bit on add exprs +; XFAIL: * + target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" define void @test() nounwind { diff --git a/test/Transforms/IndVarSimplify/2011-11-15-multiexit.ll b/test/Transforms/IndVarSimplify/2011-11-15-multiexit.ll new file mode 100644 index 0000000..c74d04e --- /dev/null +++ b/test/Transforms/IndVarSimplify/2011-11-15-multiexit.ll @@ -0,0 +1,40 @@ +; RUN: opt < %s -indvars -S | FileCheck %s +; +; Prior to the fix for PR11375, indvars would replace %firstIV with a +; loop-invariant gep computed in the preheader. This was incorrect +; because it was based on the minimum "ExitNotTaken" count. If the +; final loop test is skipped (odd number of elements) then the early +; exit would be taken and the loop invariant value would be incorrect. 
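+; Informal sketch of the hazard (editorial note, not part of the original
+; test): %firstIV below advances by 2 per iteration but is compared against
+; %last after both the +1 and the +2 step. With an odd element count only
+; the +1 exit can ever fire, so folding %first.lcssa to a preheader gep
+; sized by the +2 exit's not-taken count produced the wrong pointer.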
+ +target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" +target triple = "x86_64-apple-darwin" + +; CHECK: if.end: +; CHECK: phi i32* [ %first.lcssa, %early.exit ] +define i32 @test(i32* %first, i32* %last) uwtable ssp { +entry: + br i1 undef, label %if.end, label %if.then + +if.then: ; preds = %entry + br i1 undef, label %if.end, label %do.body + +do.body: ; preds = %if.else, %if.then + %firstIV = phi i32* [ %incdec.ptr2, %if.else ], [ %first, %if.then ] + %incdec.ptr1 = getelementptr inbounds i32* %firstIV, i64 1 + %cmp1 = icmp eq i32* %incdec.ptr1, %last + br i1 %cmp1, label %early.exit, label %if.else + +if.else: ; preds = %do.body + %incdec.ptr2 = getelementptr inbounds i32* %firstIV, i64 2 + %cmp2 = icmp eq i32* %incdec.ptr2, %last + br i1 %cmp2, label %if.end, label %do.body + +early.exit: + %first.lcssa = phi i32* [ %firstIV, %do.body ] + br label %if.end + +if.end: + %tmp = phi i32* [ %first.lcssa, %early.exit ], [ %first, %if.then ], [ %first, %entry ], [ undef, %if.else ] + %val = load i32* %tmp + ret i32 %val +} diff --git a/test/Transforms/IndVarSimplify/2011-11-17-selfphi.ll b/test/Transforms/IndVarSimplify/2011-11-17-selfphi.ll new file mode 100644 index 0000000..ccf2595 --- /dev/null +++ b/test/Transforms/IndVarSimplify/2011-11-17-selfphi.ll @@ -0,0 +1,29 @@ +; RUN: opt < %s -indvars -S | FileCheck %s +; PR11350: Check that SimplifyIndvar handles a cycle of useless self-phis. + +; CHECK: @test +; CHECK-NOT: lcssa = phi +define void @test() nounwind { +entry: + br label %for.cond.preheader + +for.cond.preheader: ; preds = %entry + br label %for.cond.outer + +for.cond.outer: ; preds = %for.cond.preheader, %for.end + %p_41.addr.0.ph = phi i32 [ %p_41.addr.1.lcssa, %for.end ], [ 1, %for.cond.preheader ] + br label %for.cond + +for.cond: + br i1 true, label %for.end, label %for.ph + +for.ph: ; preds = %for.cond4.preheader + br label %for.end + +for.end: + %p_41.addr.1.lcssa = phi i32 [ undef, %for.ph ], [ %p_41.addr.0.ph, %for.cond ] + %p_68.lobit.i = lshr i32 %p_41.addr.1.lcssa, 31 + %cmp7 = icmp eq i32 %p_41.addr.1.lcssa, 0 + %conv8 = zext i1 %cmp7 to i32 + br label %for.cond.outer +} diff --git a/test/Transforms/IndVarSimplify/loop_evaluate10.ll b/test/Transforms/IndVarSimplify/loop_evaluate10.ll index 269478a..c3619f6 100644 --- a/test/Transforms/IndVarSimplify/loop_evaluate10.ll +++ b/test/Transforms/IndVarSimplify/loop_evaluate10.ll @@ -1,8 +1,14 @@ ; RUN: opt < %s -indvars -S \ ; RUN: | grep {%b.1 = phi i32 \\\[ 2, %bb \\\], \\\[ 1, %bb2 \\\]} - +; ; This loop has multiple exits, and the value of %b1 depends on which ; exit is taken. Indvars should correctly compute the exit values. +; +; XFAIL: * +; Indvars does not currently replace loop invariant values unless all +; loop exits have the same exit value. We could handle some cases, +; such as this, by making getSCEVAtScope() sensitive to a particular +; loop exit. See PR11388. 
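+; Hypothetical illustration (editorial note): for the exit value this test
+; greps for,
+;   %b.1 = phi i32 [ 2, %bb ], [ 1, %bb2 ]
+; a per-exit getSCEVAtScope() could fold %b.1 to 2 when queried about the
+; %bb exit and to 1 for the %bb2 exit, instead of bailing out because the
+; two exits disagree.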
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128" target triple = "x86_64-pc-linux-gnu" diff --git a/test/Transforms/IndVarSimplify/loop_evaluate9.ll b/test/Transforms/IndVarSimplify/loop_evaluate9.ll index 8184a73..9f3bcaf 100644 --- a/test/Transforms/IndVarSimplify/loop_evaluate9.ll +++ b/test/Transforms/IndVarSimplify/loop_evaluate9.ll @@ -2,8 +2,13 @@ ; RUN: grep {\[%\]tmp7 = icmp eq i8 -28, -28} %t ; RUN: grep {\[%\]tmp8 = icmp eq i8 63, 63} %t ; PR4477 - ; Indvars should compute the exit values in loop. +; +; XFAIL: * +; Indvars does not currently replace loop invariant values unless all +; loop exits have the same exit value. We could handle some cases, +; such as this, by making getSCEVAtScope() sensitive to a particular +; loop exit. See PR11388. target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32" target triple = "i386-pc-linux-gnu" diff --git a/test/Transforms/InstSimplify/2011-11-23-MaskedBitsCrash.ll b/test/Transforms/InstSimplify/2011-11-23-MaskedBitsCrash.ll new file mode 100644 index 0000000..6166536 --- /dev/null +++ b/test/Transforms/InstSimplify/2011-11-23-MaskedBitsCrash.ll @@ -0,0 +1,17 @@ +; RUN: opt < %s -instsimplify + +; The mul can be proved to always overflow (turning a negative value +; into a positive one) and thus results in undefined behaviour. At +; the same time we were deducing from the nsw flag that that mul could +; be assumed to have a negative value (since if not it has an undefined +; value, which can be taken to be negative). We were reporting the mul +; as being both positive and negative, firing an assertion! +define i1 @test1(i32 %a) { +entry: + %0 = or i32 %a, 1 + %1 = shl i32 %0, 31 + %2 = mul nsw i32 %1, 4 + %3 = and i32 %2, -4 + %4 = icmp ne i32 %3, 0 + ret i1 %4 +} diff --git a/test/Transforms/LoopUnroll/unloop.ll b/test/Transforms/LoopUnroll/unloop.ll index 217c8ce..5a9cacd 100644 --- a/test/Transforms/LoopUnroll/unloop.ll +++ b/test/Transforms/LoopUnroll/unloop.ll @@ -427,3 +427,44 @@ if.end2413: ; preds = %defchar return: ; preds = %sw.bb304 ret void } + +; PR11335: the most deeply nested block should be removed from the outer loop. 
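+; (Informal reading, added commentary: once the subloop rooted at
+; for.cond.i is unrolled away and the outer loop's block list is updated,
+; for.cond3 should reach the return with no surviving branch, which is
+; what the CHECK/CHECK-NOT lines below assert.)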
+; CHECK: @removeSubloopBlocks2
+; CHECK: for.cond3:
+; CHECK-NOT: br
+; CHECK: ret void
+define void @removeSubloopBlocks2() nounwind {
+entry:
+  %tobool.i = icmp ne i32 undef, 0
+  br label %lbl_616
+
+lbl_616.loopexit:                                 ; preds = %for.cond
+  br label %lbl_616
+
+lbl_616:                                          ; preds = %lbl_616.loopexit, %entry
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.cond3, %lbl_616
+  br i1 false, label %for.cond1.preheader, label %lbl_616.loopexit
+
+for.cond1.preheader:                              ; preds = %for.cond
+  br label %for.cond1
+
+for.cond1.loopexit:                               ; preds = %for.cond.i
+  br label %for.cond1
+
+for.cond1:                                        ; preds = %for.cond1.loopexit, %for.cond1.preheader
+  br i1 false, label %for.body2, label %for.cond3
+
+for.body2:                                        ; preds = %for.cond1
+  br label %for.cond.i
+
+for.cond.i:                                       ; preds = %for.cond.i, %for.body2
+  br i1 %tobool.i, label %for.cond.i, label %for.cond1.loopexit
+
+for.cond3:                                        ; preds = %for.cond1
+  br i1 false, label %for.cond, label %if.end
+
+if.end:                                           ; preds = %for.cond3
+  ret void
+}
diff --git a/test/Transforms/SimplifyLibCalls/osx-names.ll b/test/Transforms/SimplifyLibCalls/osx-names.ll
new file mode 100644
index 0000000..e321d1d
--- /dev/null
+++ b/test/Transforms/SimplifyLibCalls/osx-names.ll
@@ -0,0 +1,30 @@
+; RUN: opt < %s -simplify-libcalls -S | FileCheck %s
+; <rdar://problem/9815881>
+; On OSX x86-32, fwrite and fputs aren't called fwrite and fputs.
+; Make sure we use the correct names.
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.7.2"
+
+%struct.__sFILE = type { i8*, i32, i32, i16, i16, %struct.__sbuf, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.__sbuf, %struct.__sFILEX*, i32, [3 x i8], [1 x i8], %struct.__sbuf, i32, i64 }
+%struct.__sbuf = type { i8*, i32 }
+%struct.__sFILEX = type opaque
+
+@.str = private unnamed_addr constant [13 x i8] c"Hello world\0A\00", align 1
+@.str2 = private unnamed_addr constant [3 x i8] c"%s\00", align 1
+
+define void @test1(%struct.__sFILE* %stream) nounwind {
+; CHECK: define void @test1
+; CHECK: call i32 @"fwrite$UNIX2003"
+  %call = tail call i32 (%struct.__sFILE*, i8*, ...)* @fprintf(%struct.__sFILE* %stream, i8* getelementptr inbounds ([13 x i8]* @.str, i32 0, i32 0)) nounwind
+  ret void
+}
+
+define void @test2(%struct.__sFILE* %stream, i8* %str) nounwind ssp {
+; CHECK: define void @test2
+; CHECK: call i32 @"fputs$UNIX2003"
+  %call = tail call i32 (%struct.__sFILE*, i8*, ...)* @fprintf(%struct.__sFILE* %stream, i8* getelementptr inbounds ([3 x i8]* @.str2, i32 0, i32 0), i8* %str) nounwind
+  ret void
+}
+
+declare i32 @fprintf(%struct.__sFILE*, i8*, ...) nounwind
diff --git a/tools/bugpoint/ExtractFunction.cpp b/tools/bugpoint/ExtractFunction.cpp
index 73b65ca..f2e61f8 100644
--- a/tools/bugpoint/ExtractFunction.cpp
+++ b/tools/bugpoint/ExtractFunction.cpp
@@ -340,7 +340,7 @@ Module *BugDriver::ExtractMappedBlocksFromModule(const
       // If the BB doesn't have a name, give it one so we have something to key
       // off of.
       if (!BB->hasName()) BB->setName("tmpbb");
-      BlocksToNotExtractFile.os() << BB->getParent()->getNameStr() << " "
+      BlocksToNotExtractFile.os() << BB->getParent()->getName() << " "
                                   << BB->getName() << "\n";
     }
     BlocksToNotExtractFile.os().close();
diff --git a/tools/gold/Makefile b/tools/gold/Makefile
index 78eda03..02f66d7 100644
--- a/tools/gold/Makefile
+++ b/tools/gold/Makefile
@@ -24,6 +24,6 @@ include $(LEVEL)/Makefile.config
 # Because off_t is used in the public API, the largefile parts are required for
 # ABI compatibility.
 CXXFLAGS += -I$(BINUTILS_INCDIR) -D_LARGEFILE_SOURCE -D_FILE_OFFSET_BITS=64
-CXXFLAGS += $(SharedLibDir)/$(SharedPrefix)LTO$(SHLIBEXT)
+CXXFLAGS += -L$(SharedLibDir)/$(SharedPrefix) -lLTO
 
 include $(LEVEL)/Makefile.common
diff --git a/tools/llc/llc.cpp b/tools/llc/llc.cpp
index 1694e01..fdc6fec 100644
--- a/tools/llc/llc.cpp
+++ b/tools/llc/llc.cpp
@@ -306,10 +306,22 @@ int main(int argc, char **argv) {
     FeaturesStr = Features.getString();
   }
 
+  CodeGenOpt::Level OLvl = CodeGenOpt::Default;
+  switch (OptLevel) {
+  default:
+    errs() << argv[0] << ": invalid optimization level.\n";
+    return 1;
+  case ' ': break;
+  case '0': OLvl = CodeGenOpt::None; break;
+  case '1': OLvl = CodeGenOpt::Less; break;
+  case '2': OLvl = CodeGenOpt::Default; break;
+  case '3': OLvl = CodeGenOpt::Aggressive; break;
+  }
+
   std::auto_ptr<TargetMachine>
     target(TheTarget->createTargetMachine(TheTriple.getTriple(),
                                           MCPU, FeaturesStr,
-                                          RelocModel, CMModel));
+                                          RelocModel, CMModel, OLvl));
   assert(target.get() && "Could not allocate target machine!");
   TargetMachine &Target = *target.get();
 
@@ -332,18 +344,6 @@ int main(int argc, char **argv) {
     (GetOutputStream(TheTarget->getName(), TheTriple.getOS(), argv[0]));
   if (!Out) return 1;
 
-  CodeGenOpt::Level OLvl = CodeGenOpt::Default;
-  switch (OptLevel) {
-  default:
-    errs() << argv[0] << ": invalid optimization level.\n";
-    return 1;
-  case ' ': break;
-  case '0': OLvl = CodeGenOpt::None; break;
-  case '1': OLvl = CodeGenOpt::Less; break;
-  case '2': OLvl = CodeGenOpt::Default; break;
-  case '3': OLvl = CodeGenOpt::Aggressive; break;
-  }
-
   // Build up all of the passes that we want to do to the module.
   PassManager PM;
 
@@ -368,7 +368,7 @@ int main(int argc, char **argv) {
   formatted_raw_ostream FOS(Out->os());
 
   // Ask the target to add backend passes as necessary.
-  if (Target.addPassesToEmitFile(PM, FOS, FileType, OLvl, NoVerify)) {
+  if (Target.addPassesToEmitFile(PM, FOS, FileType, NoVerify)) {
     errs() << argv[0] << ": target does not support generation of this"
            << " file type!\n";
     return 1;
diff --git a/tools/llvm-objdump/llvm-objdump.cpp b/tools/llvm-objdump/llvm-objdump.cpp
index d7c5492..9071d58 100644
--- a/tools/llvm-objdump/llvm-objdump.cpp
+++ b/tools/llvm-objdump/llvm-objdump.cpp
@@ -556,8 +556,10 @@ static void DumpArchive(const Archive *a) {
        e = a->end_children(); i != e; ++i) {
     OwningPtr<Binary> child;
     if (error_code ec = i->getAsBinary(child)) {
-      errs() << ToolName << ": '" << a->getFileName() << "': " << ec.message()
-             << ".\n";
+      // Ignore non-object files.
+      if (ec != object_error::invalid_file_type)
+        errs() << ToolName << ": '" << a->getFileName() << "': " << ec.message()
+               << ".\n";
       continue;
     }
     if (ObjectFile *o = dyn_cast<ObjectFile>(child.get()))
diff --git a/tools/llvm-prof/llvm-prof.cpp b/tools/llvm-prof/llvm-prof.cpp
index 9d0b468..d9b6713 100644
--- a/tools/llvm-prof/llvm-prof.cpp
+++ b/tools/llvm-prof/llvm-prof.cpp
@@ -200,9 +200,9 @@ bool ProfileInfoPrinterPass::runOnModule(Module &M) {
     }
 
     outs() << format("%3d", i+1) << ". "
" - << format("%5.2g", FunctionCounts[i].second) << "/" - << format("%g", TotalExecutions) << " " - << FunctionCounts[i].first->getNameStr() << "\n"; + << format("%5.2g", FunctionCounts[i].second) << "/" + << format("%g", TotalExecutions) << " " + << FunctionCounts[i].first->getName() << "\n"; } std::set<Function*> FunctionsToPrint; @@ -225,12 +225,12 @@ bool ProfileInfoPrinterPass::runOnModule(Module &M) { for (unsigned i = 0; i != BlocksToPrint; ++i) { if (Counts[i].second == 0) break; Function *F = Counts[i].first->getParent(); - outs() << format("%3d", i+1) << ". " - << format("%5g", Counts[i].second/(double)TotalExecutions*100) << "% " - << format("%5.0f", Counts[i].second) << "/" - << format("%g", TotalExecutions) << "\t" - << F->getNameStr() << "() - " - << Counts[i].first->getNameStr() << "\n"; + outs() << format("%3d", i+1) << ". " + << format("%5g", Counts[i].second/(double)TotalExecutions*100)<<"% " + << format("%5.0f", Counts[i].second) << "/" + << format("%g", TotalExecutions) << "\t" + << F->getName() << "() - " + << Counts[i].first->getName() << "\n"; FunctionsToPrint.insert(F); } diff --git a/tools/opt/PrintSCC.cpp b/tools/opt/PrintSCC.cpp index 533f49e..11efdcd 100644 --- a/tools/opt/PrintSCC.cpp +++ b/tools/opt/PrintSCC.cpp @@ -101,8 +101,8 @@ bool CallGraphSCC::runOnModule(Module &M) { errs() << "\nSCC #" << ++sccNum << " : "; for (std::vector<CallGraphNode*>::const_iterator I = nextSCC.begin(), E = nextSCC.end(); I != E; ++I) - errs() << ((*I)->getFunction() ? (*I)->getFunction()->getNameStr() - : std::string("external node")) << ", "; + errs() << ((*I)->getFunction() ? (*I)->getFunction()->getName() + : "external node") << ", "; if (nextSCC.size() == 1 && SCCI.hasLoop()) errs() << " (Has self-loop)."; } diff --git a/tools/opt/opt.cpp b/tools/opt/opt.cpp index 8be4e83..fe73462 100644 --- a/tools/opt/opt.cpp +++ b/tools/opt/opt.cpp @@ -291,8 +291,8 @@ struct RegionPassPrinter : public RegionPass { virtual bool runOnRegion(Region *R, RGPassManager &RGM) { if (!Quiet) { Out << "Printing analysis '" << PassToPrint->getPassName() << "' for " - << "region: '" << R->getNameStr() << "' in function '" - << R->getEntry()->getParent()->getNameStr() << "':\n"; + << "region: '" << R->getNameStr() << "' in function '" + << R->getEntry()->getParent()->getName() << "':\n"; } // Get and print pass... 
       getAnalysisID<Pass>(PassToPrint->getTypeInfo()).print(Out,
diff --git a/utils/TableGen/CodeGenTarget.cpp b/utils/TableGen/CodeGenTarget.cpp
index 4a7bad7..6e1872e 100644
--- a/utils/TableGen/CodeGenTarget.cpp
+++ b/utils/TableGen/CodeGenTarget.cpp
@@ -90,7 +90,7 @@ std::string llvm::getEnumName(MVT::SimpleValueType T) {
   case MVT::Metadata: return "MVT::Metadata";
   case MVT::iPTR:     return "MVT::iPTR";
   case MVT::iPTRAny:  return "MVT::iPTRAny";
-  case MVT::untyped:  return "MVT::untyped";
+  case MVT::Untyped:  return "MVT::Untyped";
   default: assert(0 && "ILLEGAL VALUE TYPE!"); return "";
   }
 }
diff --git a/utils/TableGen/EDEmitter.cpp b/utils/TableGen/EDEmitter.cpp
index effb71b..1953dad 100644
--- a/utils/TableGen/EDEmitter.cpp
+++ b/utils/TableGen/EDEmitter.cpp
@@ -607,6 +607,7 @@ static int ARMFlagFromOpName(LiteralConstantEmitter *type,
   IMM("nImmSplatI32");
   IMM("nImmSplatI64");
   IMM("nImmVMOVI32");
+  IMM("nImmVMOVF32");
   IMM("imm0_7");
   IMM("imm0_15");
   IMM("imm0_255");
diff --git a/utils/TableGen/X86RecognizableInstr.cpp b/utils/TableGen/X86RecognizableInstr.cpp
index 3478809..ce8ea1a 100644
--- a/utils/TableGen/X86RecognizableInstr.cpp
+++ b/utils/TableGen/X86RecognizableInstr.cpp
@@ -392,18 +392,11 @@ RecognizableInstr::filter_ret RecognizableInstr::filter() const {
 
   // Filter out artificial instructions
 
-  if (Name.find("TAILJMP") != Name.npos   ||
-      Name.find("_Int") != Name.npos      ||
-      Name.find("_int") != Name.npos      ||
+  if (Name.find("_Int") != Name.npos      ||
       Name.find("Int_") != Name.npos      ||
       Name.find("_NOREX") != Name.npos    ||
-      Name.find("_TC") != Name.npos       ||
-      Name.find("EH_RETURN") != Name.npos ||
-      Name.find("V_SET") != Name.npos     ||
-      Name.find("LOCK_") != Name.npos     ||
-      Name.find("WIN") != Name.npos       ||
-      Name.find("_AVX") != Name.npos      ||
-      Name.find("2SDL") != Name.npos)
+      Name.find("2SDL") != Name.npos      ||
+      Name == "LOCK_PREFIX")
     return FILTER_STRONG;
 
   // Filter out instructions with segment override prefixes.
@@ -440,12 +433,6 @@ RecognizableInstr::filter_ret RecognizableInstr::filter() const {
       Name.find("Xrr") != Name.npos ||
       Name.find("rr64") != Name.npos)
     return FILTER_WEAK;
-
-  if (Name == "VMASKMOVDQU64" ||
-      Name == "VEXTRACTPSrr64" ||
-      Name == "VMOVQd64rr" ||
-      Name == "VMOVQs64rr")
-    return FILTER_WEAK;
 
   // Special cases.
@@ -460,29 +447,15 @@ RecognizableInstr::filter_ret RecognizableInstr::filter() const {
     return FILTER_WEAK;
   if (Name.find("Fs") != Name.npos)
     return FILTER_WEAK;
-  if (Name == "MOVLPDrr"       ||
-      Name == "MOVLPSrr"       ||
-      Name == "PUSHFQ"         ||
-      Name == "BSF16rr"        ||
-      Name == "BSF16rm"        ||
-      Name == "BSR16rr"        ||
-      Name == "BSR16rm"        ||
-      Name == "MOVSX16rm8"     ||
-      Name == "MOVSX16rr8"     ||
-      Name == "MOVZX16rm8"     ||
-      Name == "MOVZX16rr8"     ||
-      Name == "PUSH32i16"      ||
-      Name == "PUSH64i16"      ||
+  if (Name == "PUSH64i16"      ||
       Name == "MOVPQI2QImr"    ||
       Name == "VMOVPQI2QImr"   ||
-      Name == "MOVSDmr"        ||
-      Name == "MOVSDrm"        ||
-      Name == "MOVSSmr"        ||
-      Name == "MOVSSrm"        ||
       Name == "MMX_MOVD64rrv164" ||
-      Name == "CRC32m16"       ||
       Name == "MOV64ri64i32"   ||
-      Name == "CRC32r16")
+      Name == "VMASKMOVDQU64"  ||
+      Name == "VEXTRACTPSrr64" ||
+      Name == "VMOVQd64rr"     ||
+      Name == "VMOVQs64rr")
    return FILTER_WEAK;
 
   if (HasFROperands && Name.find("MOV") != Name.npos &&
diff --git a/utils/llvm-build/llvmbuild/main.py b/utils/llvm-build/llvmbuild/main.py
index fa76aa5..21cf392 100644
--- a/utils/llvm-build/llvmbuild/main.py
+++ b/utils/llvm-build/llvmbuild/main.py
@@ -21,6 +21,21 @@ def cmake_quote_string(value):
 
     return value
 
+def cmake_quote_path(value):
+    """
+    cmake_quote_path(value) -> str
+
+    Return a quoted form of the given value that is suitable for use in CMake
+    language files.
+    """
+
+    # CMake has a bug in its Makefile generator that doesn't properly quote
+    # strings it generates. So instead of using proper quoting, we just use "/"
+    # style paths. Currently, we only handle escaping backslashes.
+    value = value.replace("\\", "/")
+
+    return value
+
 def mk_quote_string_for_target(value):
     """
     mk_quote_string_for_target(target_name) -> str
@@ -430,7 +445,7 @@ class LLVMProjectInfo(object):
             print >>f, """\
 configure_file(\"%s\"
                ${CMAKE_CURRENT_BINARY_DIR}/DummyConfigureOutput)""" % (
-                cmake_quote_string(dep),)
+                cmake_quote_path(dep),)
 
         f.close()
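As an aside, a minimal standalone sketch of what the new cmake_quote_path
helper above does; the sample path is an illustrative assumption, not a value
taken from the tree:

    # Sketch of the helper added to utils/llvm-build/llvmbuild/main.py.
    def cmake_quote_path(value):
        # CMake's Makefile generator mis-handles backslash escapes, so we
        # sidestep quoting entirely by normalizing to "/"-style paths.
        return value.replace("\\", "/")

    # Hypothetical Windows-style dependency path:
    print(cmake_quote_path("C:\\llvm\\cmake\\config-ix.cmake"))
    # prints: C:/llvm/cmake/config-ix.cmake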