author     Stephen Hines <srhines@google.com>    2014-04-23 16:57:46 -0700
committer  Stephen Hines <srhines@google.com>    2014-04-24 15:53:16 -0700
commit     36b56886974eae4f9c5ebc96befd3e7bfe5de338 (patch)
tree       e6cfb69fbbd937f450eeb83bfb83b9da3b01275a /test/CodeGen/X86
parent     69a8640022b04415ae9fac62f8ab090601d8f889 (diff)
Update to LLVM 3.5a.
Change-Id: Ifadecab779f128e62e430c2b4f6ddd84953ed617
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--  test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-07-20-InlineAsm.ll | 2
-rw-r--r--  test/CodeGen/X86/2006-07-31-SingleRegClass.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-05-05-Personality.ll | 13
-rw-r--r--  test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll | 67
-rw-r--r--  test/CodeGen/X86/2007-10-17-IllegalAsm.ll | 87
-rw-r--r--  test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-03-14-SpillerCrash.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-04-02-unnamedEH.ll | 6
-rw-r--r--  test/CodeGen/X86/2008-04-08-CoalescerCrash.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-08-31-EH_RETURN64.ll | 2
-rw-r--r--  test/CodeGen/X86/2008-09-18-inline-asm-2.ll | 6
-rw-r--r--  test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll | 4
-rw-r--r--  test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll | 4
-rw-r--r--  test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll | 10
-rw-r--r--  test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-06-05-VZextByteShort.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-08-23-linkerprivate.ll | 8
-rw-r--r--  test/CodeGen/X86/2009-09-19-earlyclobber.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-05-26-DotDebugLoc.ll | 4
-rw-r--r--  test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-06-25-asm-RA-crash.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-06-28-matched-g-constraint.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-07-02-asm-alignstack.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-07-06-asm-RIP.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-07-13-indirectXconstraint.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-09-16-EmptyFilename.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-10-08-cmpxchg8b.ll | 2
-rw-r--r--  test/CodeGen/X86/2010-12-02-MC-Set.ll | 27
-rw-r--r--  test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll | 20
-rw-r--r--  test/CodeGen/X86/2011-05-09-loaduse.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-10-11-SpillDead.ll | 2
-rw-r--r--  test/CodeGen/X86/2011-10-19-widen_vselect.ll | 18
-rw-r--r--  test/CodeGen/X86/2011-12-28-vselecti8.ll | 19
-rw-r--r--  test/CodeGen/X86/2012-08-17-legalizer-crash.ll | 2
-rw-r--r--  test/CodeGen/X86/2012-11-30-handlemove-dbg.ll | 6
-rw-r--r--  test/CodeGen/X86/2012-11-30-misched-dbg.ll | 7
-rw-r--r--  test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll | 8
-rw-r--r--  test/CodeGen/X86/3addr-16bit.ll | 6
-rw-r--r--  test/CodeGen/X86/Atomics-64.ll | 40
-rw-r--r--  test/CodeGen/X86/GC/ocaml-gc.ll | 6
-rw-r--r--  test/CodeGen/X86/MachineBranchProb.ll | 34
-rw-r--r--  test/CodeGen/X86/MachineSink-DbgValue.ll | 4
-rw-r--r--  test/CodeGen/X86/alias-error.ll | 5
-rw-r--r--  test/CodeGen/X86/anyregcc-crash.ll | 4
-rw-r--r--  test/CodeGen/X86/anyregcc.ll | 184
-rw-r--r--  test/CodeGen/X86/asm-block-labels.ll | 2
-rw-r--r--  test/CodeGen/X86/asm-global-imm.ll | 2
-rw-r--r--  test/CodeGen/X86/atom-cmpb.ll | 36
-rw-r--r--  test/CodeGen/X86/atomic16.ll | 2
-rw-r--r--  test/CodeGen/X86/atomic32.ll | 2
-rw-r--r--  test/CodeGen/X86/atomic64.ll | 2
-rw-r--r--  test/CodeGen/X86/atomic6432.ll | 2
-rw-r--r--  test/CodeGen/X86/atomic8.ll | 2
-rw-r--r--  test/CodeGen/X86/atomic_op.ll | 6
-rw-r--r--  test/CodeGen/X86/avx-blend.ll | 21
-rw-r--r--  test/CodeGen/X86/avx-cvt-2.ll | 43
-rw-r--r--  test/CodeGen/X86/avx-shift.ll | 4
-rw-r--r--  test/CodeGen/X86/avx-shuffle.ll | 9
-rw-r--r--  test/CodeGen/X86/avx-trunc.ll | 8
-rw-r--r--  test/CodeGen/X86/avx-vbroadcast.ll | 63
-rw-r--r--  test/CodeGen/X86/avx-vzeroupper.ll | 37
-rw-r--r--  test/CodeGen/X86/avx2-gather.ll | 16
-rw-r--r--  test/CodeGen/X86/avx2-intrinsics-x86.ll | 8
-rw-r--r--  test/CodeGen/X86/avx2-shift.ll | 33
-rw-r--r--  test/CodeGen/X86/avx2-vbroadcast.ll | 228
-rw-r--r--  test/CodeGen/X86/avx2-vector-shifts.ll | 55
-rw-r--r--  test/CodeGen/X86/avx512-arith.ll | 139
-rw-r--r--  test/CodeGen/X86/avx512-build-vector.ll | 12
-rw-r--r--  test/CodeGen/X86/avx512-cmp.ll | 67
-rw-r--r--  test/CodeGen/X86/avx512-cvt.ll | 34
-rw-r--r--  test/CodeGen/X86/avx512-gather-scatter-intrin.ll | 78
-rw-r--r--  test/CodeGen/X86/avx512-insert-extract.ll | 75
-rw-r--r--  test/CodeGen/X86/avx512-intrinsics.ll | 435
-rw-r--r--  test/CodeGen/X86/avx512-mask-op.ll | 49
-rw-r--r--  test/CodeGen/X86/avx512-mov.ll | 32
-rw-r--r--  test/CodeGen/X86/avx512-select.ll | 19
-rw-r--r--  test/CodeGen/X86/avx512-shuffle.ll | 29
-rw-r--r--  test/CodeGen/X86/avx512-trunc-ext.ll | 31
-rw-r--r--  test/CodeGen/X86/avx512-vbroadcast.ll | 19
-rw-r--r--  test/CodeGen/X86/avx512-vec-cmp.ll | 87
-rw-r--r--  test/CodeGen/X86/avx512-vselect-crash.ll | 11
-rw-r--r--  test/CodeGen/X86/avx512-zext-load-crash.ll | 14
-rw-r--r--  test/CodeGen/X86/barrier-sse.ll | 11
-rw-r--r--  test/CodeGen/X86/blend-msb.ll | 21
-rw-r--r--  test/CodeGen/X86/block-placement.ll | 2
-rw-r--r--  test/CodeGen/X86/bswap-vector.ll | 19
-rw-r--r--  test/CodeGen/X86/bt.ll | 2
-rw-r--r--  test/CodeGen/X86/cache-intrinsic.ll | 26
-rw-r--r--  test/CodeGen/X86/call-imm.ll | 2
-rw-r--r--  test/CodeGen/X86/cas.ll | 2
-rw-r--r--  test/CodeGen/X86/catch.ll | 21
-rw-r--r--  test/CodeGen/X86/cdecl-method-return.ll | 69
-rw-r--r--  test/CodeGen/X86/cfstring.ll | 6
-rw-r--r--  test/CodeGen/X86/cmov.ll | 4
-rw-r--r--  test/CodeGen/X86/cmpxchg16b.ll | 2
-rw-r--r--  test/CodeGen/X86/coalescer-remat.ll | 2
-rw-r--r--  test/CodeGen/X86/codegen-prepare-addrmode-sext.ll | 303
-rw-r--r--  test/CodeGen/X86/codegen-prepare-cast.ll | 4
-rw-r--r--  test/CodeGen/X86/codegen-prepare-extload.ll | 2
-rw-r--r--  test/CodeGen/X86/combine-or.ll | 269
-rw-r--r--  test/CodeGen/X86/combine-vec-shuffle.ll | 253
-rw-r--r--  test/CodeGen/X86/const-base-addr.ll | 24
-rw-r--r--  test/CodeGen/X86/crash.ll | 6
-rw-r--r--  test/CodeGen/X86/cse-add-with-overflow.ll | 43
-rw-r--r--  test/CodeGen/X86/ctpop-combine.ll | 2
-rw-r--r--  test/CodeGen/X86/darwin-no-dead-strip.ll | 12
-rw-r--r--  test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll | 109
-rw-r--r--  test/CodeGen/X86/dbg-changes-codegen.ll | 83
-rw-r--r--  test/CodeGen/X86/dll-linkage.ll | 14
-rw-r--r--  test/CodeGen/X86/dllexport-x86_64.ll | 104
-rw-r--r--  test/CodeGen/X86/dllexport.ll | 125
-rw-r--r--  test/CodeGen/X86/dllimport-x86_64.ll | 48
-rw-r--r--  test/CodeGen/X86/dllimport.ll | 59
-rw-r--r--  test/CodeGen/X86/dwarf-comp-dir.ll | 6
-rw-r--r--  test/CodeGen/X86/dynamic-alloca-in-entry.ll | 19
-rw-r--r--  test/CodeGen/X86/exedepsfix-broadcast.ll | 128
-rw-r--r--  test/CodeGen/X86/extract-store.ll | 22
-rw-r--r--  test/CodeGen/X86/fast-isel-args-fail.ll | 2
-rw-r--r--  test/CodeGen/X86/fast-isel-select.ll | 16
-rw-r--r--  test/CodeGen/X86/fast-isel-x86.ll | 6
-rw-r--r--  test/CodeGen/X86/fast-isel.ll | 4
-rw-r--r--  test/CodeGen/X86/fastcall-correct-mangling.ll | 25
-rw-r--r--  test/CodeGen/X86/fma.ll | 33
-rw-r--r--  test/CodeGen/X86/fma3-intrinsics.ll | 4
-rw-r--r--  test/CodeGen/X86/fma4-intrinsics-x86_64.ll | 2
-rw-r--r--  test/CodeGen/X86/fold-call-oper.ll | 48
-rw-r--r--  test/CodeGen/X86/fold-vector-sext-crash.ll | 12
-rw-r--r--  test/CodeGen/X86/fold-vector-sext-zext.ll | 291
-rw-r--r--  test/CodeGen/X86/fold-xmm-zero.ll | 2
-rw-r--r--  test/CodeGen/X86/fp-fast.ll | 2
-rw-r--r--  test/CodeGen/X86/gcc_except_table.ll | 35
-rw-r--r--  test/CodeGen/X86/global-sections.ll | 36
-rw-r--r--  test/CodeGen/X86/hidden-vis-pic.ll | 7
-rw-r--r--  test/CodeGen/X86/i64-mem-copy.ll | 2
-rw-r--r--  test/CodeGen/X86/inalloca-ctor.ll | 34
-rw-r--r--  test/CodeGen/X86/inalloca-invoke.ll | 54
-rw-r--r--  test/CodeGen/X86/inalloca-stdcall.ll | 26
-rw-r--r--  test/CodeGen/X86/inalloca.ll | 65
-rw-r--r--  test/CodeGen/X86/inline-asm-flag-clobber.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm-fpstack.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm-h.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm-modifier-n.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm-modifier-q.ll | 12
-rw-r--r--  test/CodeGen/X86/inline-asm-mrv.ll | 8
-rw-r--r--  test/CodeGen/X86/inline-asm-q-regs.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll | 17
-rw-r--r--  test/CodeGen/X86/inline-asm-stack-realign.ll | 16
-rw-r--r--  test/CodeGen/X86/inline-asm-stack-realign2.ll | 16
-rw-r--r--  test/CodeGen/X86/inline-asm-stack-realign3.ll | 29
-rw-r--r--  test/CodeGen/X86/inline-asm-tied.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm-x-scalar.ll | 2
-rw-r--r--  test/CodeGen/X86/inline-asm.ll | 17
-rw-r--r--  test/CodeGen/X86/ins_split_regalloc.ll | 33
-rw-r--r--  test/CodeGen/X86/isint.ll | 26
-rw-r--r--  test/CodeGen/X86/large-constants.ll | 67
-rw-r--r--  test/CodeGen/X86/load-slice.ll | 2
-rw-r--r--  test/CodeGen/X86/lsr-interesting-step.ll | 14
-rw-r--r--  test/CodeGen/X86/lsr-normalization.ll | 9
-rw-r--r--  test/CodeGen/X86/machine-cp.ll | 26
-rw-r--r--  test/CodeGen/X86/mature-mc-support.ll | 18
-rw-r--r--  test/CodeGen/X86/memcmp.ll | 22
-rw-r--r--  test/CodeGen/X86/memset-2.ll | 4
-rw-r--r--  test/CodeGen/X86/misched-aa-colored.ll | 189
-rw-r--r--  test/CodeGen/X86/misched-aa-mmos.ll | 37
-rw-r--r--  test/CodeGen/X86/misched-matmul.ll | 2
-rw-r--r--  test/CodeGen/X86/movbe.ll | 45
-rw-r--r--  test/CodeGen/X86/ms-inline-asm.ll | 35
-rw-r--r--  test/CodeGen/X86/mul128_sext_loop.ll | 32
-rw-r--r--  test/CodeGen/X86/mult-alt-generic-i686.ll | 2
-rw-r--r--  test/CodeGen/X86/mult-alt-generic-x86_64.ll | 2
-rw-r--r--  test/CodeGen/X86/mult-alt-x86.ll | 2
-rw-r--r--  test/CodeGen/X86/multiple-loop-post-inc.ll | 2
-rw-r--r--  test/CodeGen/X86/negate-add-zero.ll | 17
-rw-r--r--  test/CodeGen/X86/no-elf-compact-unwind.ll | 48
-rw-r--r--  test/CodeGen/X86/nocx16.ll | 2
-rw-r--r--  test/CodeGen/X86/opaque-constant-asm.ll | 13
-rw-r--r--  test/CodeGen/X86/osx-private-labels.ll | 71
-rw-r--r--  test/CodeGen/X86/patchpoint.ll | 76
-rw-r--r--  test/CodeGen/X86/peephole-multiple-folds.ll | 29
-rw-r--r--  test/CodeGen/X86/personality.ll | 15
-rw-r--r--  test/CodeGen/X86/personality_size.ll | 4
-rw-r--r--  test/CodeGen/X86/pic.ll | 3
-rw-r--r--  test/CodeGen/X86/pr10420.ll | 21
-rw-r--r--  test/CodeGen/X86/pr14090.ll | 70
-rw-r--r--  test/CodeGen/X86/pr1462.ll | 3
-rw-r--r--  test/CodeGen/X86/pr16031.ll | 2
-rw-r--r--  test/CodeGen/X86/pr19049.ll | 7
-rw-r--r--  test/CodeGen/X86/preserve_allcc64.ll | 104
-rw-r--r--  test/CodeGen/X86/preserve_mostcc64.ll | 86
-rw-r--r--  test/CodeGen/X86/private-2.ll | 2
-rw-r--r--  test/CodeGen/X86/ragreedy-bug.ll | 292
-rw-r--r--  test/CodeGen/X86/ragreedy-hoist-spill.ll | 389
-rw-r--r--  test/CodeGen/X86/ragreedy-last-chance-recoloring.ll | 168
-rw-r--r--  test/CodeGen/X86/rot16.ll | 2
-rw-r--r--  test/CodeGen/X86/rotate3.ll | 76
-rw-r--r--  test/CodeGen/X86/rotate4.ll | 134
-rw-r--r--  test/CodeGen/X86/saddo-redundant-add.ll | 34
-rw-r--r--  test/CodeGen/X86/segmented-stacks.ll | 57
-rw-r--r--  test/CodeGen/X86/setjmp-spills.ll | 141
-rw-r--r--  test/CodeGen/X86/shift-combine-crash.ll | 57
-rw-r--r--  test/CodeGen/X86/shift-double.ll | 2
-rw-r--r--  test/CodeGen/X86/shift-pcmp.ll | 30
-rw-r--r--  test/CodeGen/X86/shl_undef.ll | 6
-rw-r--r--  test/CodeGen/X86/shrink-compare.ll | 8
-rw-r--r--  test/CodeGen/X86/sibcall-5.ll | 9
-rw-r--r--  test/CodeGen/X86/sibcall.ll | 4
-rw-r--r--  test/CodeGen/X86/sse-scalar-fp-arith-2.ll | 423
-rw-r--r--  test/CodeGen/X86/sse-scalar-fp-arith.ll | 310
-rw-r--r--  test/CodeGen/X86/sse1.ll | 14
-rw-r--r--  test/CodeGen/X86/sse2-blend.ll | 22
-rw-r--r--  test/CodeGen/X86/sse2-intrinsics-x86.ll | 7
-rw-r--r--  test/CodeGen/X86/sse2-vector-shifts.ll | 180
-rw-r--r--  test/CodeGen/X86/sse2.ll | 6
-rw-r--r--  test/CodeGen/X86/sse41-blend.ll | 12
-rw-r--r--  test/CodeGen/X86/ssp-data-layout.ll | 510
-rw-r--r--  test/CodeGen/X86/stack-align-memcpy.ll | 27
-rw-r--r--  test/CodeGen/X86/stack-protector-dbginfo.ll | 2
-rw-r--r--  test/CodeGen/X86/stackmap-liveness.ll | 245
-rw-r--r--  test/CodeGen/X86/stackmap-nops.ll | 230
-rw-r--r--  test/CodeGen/X86/stackmap.ll | 333
-rw-r--r--  test/CodeGen/X86/stdcall-notailcall.ll | 10
-rw-r--r--  test/CodeGen/X86/stdcall.ll | 4
-rw-r--r--  test/CodeGen/X86/stores-merging.ll | 23
-rw-r--r--  test/CodeGen/X86/sunkaddr-ext.ll | 26
-rw-r--r--  test/CodeGen/X86/tbm-intrinsics-x86_64.ll | 2
-rw-r--r--  test/CodeGen/X86/v2f32.ll | 2
-rw-r--r--  test/CodeGen/X86/v4i32load-crash.ll | 5
-rw-r--r--  test/CodeGen/X86/vaargs.ll | 67
-rw-r--r--  test/CodeGen/X86/vastart-defs-eflags.ll | 23
-rw-r--r--  test/CodeGen/X86/vbinop-simplify-bug.ll | 23
-rw-r--r--  test/CodeGen/X86/vec_round.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_setcc-2.ll | 96
-rw-r--r--  test/CodeGen/X86/vec_setcc.ll | 18
-rw-r--r--  test/CodeGen/X86/vec_shift4.ll | 2
-rw-r--r--  test/CodeGen/X86/vec_shift5.ll | 160
-rw-r--r--  test/CodeGen/X86/vec_shift6.ll | 134
-rw-r--r--  test/CodeGen/X86/vec_shuf-insert.ll | 29
-rw-r--r--  test/CodeGen/X86/vec_shuffle-40.ll | 22
-rw-r--r--  test/CodeGen/X86/vector-gep.ll | 31
-rw-r--r--  test/CodeGen/X86/viabs.ll | 87
-rw-r--r--  test/CodeGen/X86/vselect-2.ll | 33
-rw-r--r--  test/CodeGen/X86/vselect.ll | 264
-rw-r--r--  test/CodeGen/X86/vshift-4.ll | 2
-rw-r--r--  test/CodeGen/X86/vshift-6.ll | 36
-rw-r--r--  test/CodeGen/X86/warn-stack.ll | 2
-rw-r--r--  test/CodeGen/X86/weak_def_can_be_hidden.ll | 37
-rw-r--r--  test/CodeGen/X86/widen_load-2.ll | 2
-rw-r--r--  test/CodeGen/X86/win32_sret.ll | 98
-rw-r--r--  test/CodeGen/X86/win64_alloca_dynalloca.ll | 6
-rw-r--r--  test/CodeGen/X86/win_chkstk.ll | 6
-rw-r--r--  test/CodeGen/X86/x86-64-double-precision-shift-left.ll | 77
-rw-r--r--  test/CodeGen/X86/x86-64-double-precision-shift-right.ll | 74
-rw-r--r--  test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll | 67
-rw-r--r--  test/CodeGen/X86/x86-64-double-shifts-var.ll | 57
-rw-r--r--  test/CodeGen/X86/x86-shifts.ll | 2
-rw-r--r--  test/CodeGen/X86/zlib-longest-match.ll | 240
270 files changed, 10460 insertions(+), 1217 deletions(-)
diff --git a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
index d906da4..1b3fc38 100644
--- a/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
+++ b/test/CodeGen/X86/2006-01-19-ISelFoldingBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | \
+; RUN: llc < %s -march=x86 -mcpu=generic | \
; RUN: grep shld | count 1
;
; Check that the isel does not fold the shld, which already folds a load
diff --git a/test/CodeGen/X86/2006-07-20-InlineAsm.ll b/test/CodeGen/X86/2006-07-20-InlineAsm.ll
index cac47cd..1facf15 100644
--- a/test/CodeGen/X86/2006-07-20-InlineAsm.ll
+++ b/test/CodeGen/X86/2006-07-20-InlineAsm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
; PR833
@G = weak global i32 0 ; <i32*> [#uses=3]
diff --git a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
index c4b08a3..2a9c832 100644
--- a/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
+++ b/test/CodeGen/X86/2006-07-31-SingleRegClass.ll
@@ -1,5 +1,5 @@
; PR850
-; RUN: llc < %s -march=x86 -x86-asm-syntax=att | FileCheck %s
+; RUN: llc < %s -march=x86 -x86-asm-syntax=att -no-integrated-as | FileCheck %s
; CHECK: {{movl 4[(]%eax[)],%ebp}}
; CHECK: {{movl 0[(]%eax[)], %ebx}}
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
index 3b2e443..93fb344 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmPModifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep "mov %gs:72, %eax"
+; RUN: llc < %s -march=x86 -no-integrated-as | grep "mov %gs:72, %eax"
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
index 366f583..6cf8bf9 100644
--- a/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
+++ b/test/CodeGen/X86/2007-03-24-InlineAsmVectorOp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=yonah -march=x86 | FileCheck %s
+; RUN: llc < %s -mcpu=yonah -march=x86 -no-integrated-as | FileCheck %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin9"
diff --git a/test/CodeGen/X86/2007-05-05-Personality.ll b/test/CodeGen/X86/2007-05-05-Personality.ll
index 7d21b71..5b8fe72 100644
--- a/test/CodeGen/X86/2007-05-05-Personality.ll
+++ b/test/CodeGen/X86/2007-05-05-Personality.ll
@@ -1,7 +1,12 @@
-; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s
-
-; CHECK: .cfi_personality 0, __gnat_eh_personality
-; CHECK: .cfi_lsda 0, .Lexception0
+; RUN: llc < %s -mtriple=i686-pc-linux-gnu -o - | FileCheck %s --check-prefix=LIN
+; RUN: llc < %s -mtriple=x86_64-pc-windows-gnu -o - | FileCheck %s --check-prefix=LIN
+; RUN: llc < %s -mtriple=i386-pc-mingw32 -o - | FileCheck %s --check-prefix=WIN
+; RUN: llc < %s -mtriple=i686-pc-windows-gnu -o - | FileCheck %s --check-prefix=WIN
+
+; LIN: .cfi_personality 0, __gnat_eh_personality
+; LIN: .cfi_lsda 0, .Lexception0
+; WIN: .cfi_personality 0, ___gnat_eh_personality
+; WIN: .cfi_lsda 0, Lexception0
@error = external global i8
diff --git a/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll b/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll
deleted file mode 100644
index 15466a1..0000000
--- a/test/CodeGen/X86/2007-09-17-ObjcFrameEH.ll
+++ /dev/null
@@ -1,67 +0,0 @@
-; RUN: llc < %s -disable-cfi -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
-
-; CHECK: "_-[NSString(local) isNullOrNil].eh":
-
- %struct.NSString = type { }
- %struct._objc__method_prototype_list = type opaque
- %struct._objc_category = type { i8*, i8*, %struct._objc_method_list*, %struct._objc_method_list*, %struct._objc_protocol**, i32, %struct._prop_list_t* }
- %struct._objc_method = type { %struct.objc_selector*, i8*, i8* }
- %struct._objc_method_list = type opaque
- %struct._objc_module = type { i32, i32, i8*, %struct._objc_symtab* }
- %struct._objc_protocol = type { %struct._objc_protocol_extension*, i8*, %struct._objc_protocol**, %struct._objc__method_prototype_list*, %struct._objc__method_prototype_list* }
- %struct._objc_protocol_extension = type opaque
- %struct._objc_symtab = type { i32, %struct.objc_selector**, i16, i16, [1 x i8*] }
- %struct._prop_list_t = type opaque
- %struct.anon = type { %struct._objc__method_prototype_list*, i32, [1 x %struct._objc_method] }
- %struct.objc_selector = type opaque
-@"\01L_OBJC_SYMBOLS" = internal global { i32, i32, i16, i16, [1 x %struct._objc_category*] } {
- i32 0,
- i32 0,
- i16 0,
- i16 1,
- [1 x %struct._objc_category*] [ %struct._objc_category* bitcast ({ i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 }* @"\01L_OBJC_CATEGORY_NSString_local" to %struct._objc_category*) ] }, section "__OBJC,__symbols,regular,no_dead_strip" ; <{ i32, i32, i16, i16, [1 x %struct._objc_category*] }*> [#uses=2]
-@"\01L_OBJC_CATEGORY_INSTANCE_METHODS_NSString_local" = internal global { i32, i32, [1 x %struct._objc_method] } {
- i32 0,
- i32 1,
- [1 x %struct._objc_method] [ %struct._objc_method {
- %struct.objc_selector* bitcast ([12 x i8]* @"\01L_OBJC_METH_VAR_NAME_0" to %struct.objc_selector*),
- i8* getelementptr ([7 x i8]* @"\01L_OBJC_METH_VAR_TYPE_0", i32 0, i32 0),
- i8* bitcast (i8 (%struct.NSString*, %struct.objc_selector*) * @"-[NSString(local) isNullOrNil]" to i8*) } ] }, section "__OBJC,__cat_inst_meth,regular,no_dead_strip" ; <{ i32, i32, [1 x %struct._objc_method] }*> [#uses=3]
-@"\01L_OBJC_CATEGORY_NSString_local" = internal global { i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 } {
- i8* getelementptr ([6 x i8]* @"\01L_OBJC_CLASS_NAME_0", i32 0, i32 0),
- i8* getelementptr ([9 x i8]* @"\01L_OBJC_CLASS_NAME_1", i32 0, i32 0),
- %struct._objc_method_list* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01L_OBJC_CATEGORY_INSTANCE_METHODS_NSString_local" to %struct._objc_method_list*),
- i32 0,
- i32 0,
- i32 28,
- i32 0 }, section "__OBJC,__category,regular,no_dead_strip" ; <{ i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 }*> [#uses=2]
-@"\01L_OBJC_IMAGE_INFO" = internal constant [2 x i32] zeroinitializer, section "__OBJC,__image_info,regular" ; <[2 x i32]*> [#uses=1]
-@"\01L_OBJC_MODULES" = internal global %struct._objc_module {
- i32 7,
- i32 16,
- i8* getelementptr ([1 x i8]* @"\01L_OBJC_CLASS_NAME_2", i32 0, i32 0),
- %struct._objc_symtab* bitcast ({ i32, i32, i16, i16, [1 x %struct._objc_category*] }* @"\01L_OBJC_SYMBOLS" to %struct._objc_symtab*) }, section "__OBJC,__module_info,regular,no_dead_strip" ; <%struct._objc_module*> [#uses=1]
-@"\01.objc_class_ref_NSString" = internal global i8* @"\01.objc_class_name_NSString" ; <i8**> [#uses=0]
-@"\01.objc_class_name_NSString" = external global i8 ; <i8*> [#uses=1]
-@"\01.objc_category_name_NSString_local" = constant i32 0 ; <i32*> [#uses=1]
-@"\01L_OBJC_CLASS_NAME_2" = internal global [1 x i8] zeroinitializer, section "__TEXT,__cstring,cstring_literals" ; <[1 x i8]*> [#uses=2]
-@"\01L_OBJC_CLASS_NAME_1" = internal global [9 x i8] c"NSString\00", section "__TEXT,__cstring,cstring_literals" ; <[9 x i8]*> [#uses=2]
-@"\01L_OBJC_CLASS_NAME_0" = internal global [6 x i8] c"local\00", section "__TEXT,__cstring,cstring_literals" ; <[6 x i8]*> [#uses=2]
-@"\01L_OBJC_METH_VAR_NAME_0" = internal global [12 x i8] c"isNullOrNil\00", section "__TEXT,__cstring,cstring_literals" ; <[12 x i8]*> [#uses=3]
-@"\01L_OBJC_METH_VAR_TYPE_0" = internal global [7 x i8] c"c8@0:4\00", section "__TEXT,__cstring,cstring_literals" ; <[7 x i8]*> [#uses=2]
-@llvm.used = appending global [11 x i8*] [ i8* bitcast ({ i32, i32, i16, i16, [1 x %struct._objc_category*] }* @"\01L_OBJC_SYMBOLS" to i8*), i8* bitcast ({ i32, i32, [1 x %struct._objc_method] }* @"\01L_OBJC_CATEGORY_INSTANCE_METHODS_NSString_local" to i8*), i8* bitcast ({ i8*, i8*, %struct._objc_method_list*, i32, i32, i32, i32 }* @"\01L_OBJC_CATEGORY_NSString_local" to i8*), i8* bitcast ([2 x i32]* @"\01L_OBJC_IMAGE_INFO" to i8*), i8* bitcast (%struct._objc_module* @"\01L_OBJC_MODULES" to i8*), i8* bitcast (i32* @"\01.objc_category_name_NSString_local" to i8*), i8* getelementptr ([1 x i8]* @"\01L_OBJC_CLASS_NAME_2", i32 0, i32 0), i8* getelementptr ([9 x i8]* @"\01L_OBJC_CLASS_NAME_1", i32 0, i32 0), i8* getelementptr ([6 x i8]* @"\01L_OBJC_CLASS_NAME_0", i32 0, i32 0), i8* getelementptr ([12 x i8]* @"\01L_OBJC_METH_VAR_NAME_0", i32 0, i32 0), i8* getelementptr ([7 x i8]* @"\01L_OBJC_METH_VAR_TYPE_0", i32 0, i32 0) ], section "llvm.metadata" ; <[11 x i8*]*> [#uses=0]
-
-define internal signext i8 @"-[NSString(local) isNullOrNil]"(%struct.NSString* %self, %struct.objc_selector* %_cmd) {
-entry:
- %self_addr = alloca %struct.NSString* ; <%struct.NSString**> [#uses=1]
- %_cmd_addr = alloca %struct.objc_selector* ; <%struct.objc_selector**> [#uses=1]
- %retval = alloca i8, align 1 ; <i8*> [#uses=1]
- %"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
- store %struct.NSString* %self, %struct.NSString** %self_addr
- store %struct.objc_selector* %_cmd, %struct.objc_selector** %_cmd_addr
- br label %return
-
-return: ; preds = %entry
- %retval1 = load i8* %retval ; <i8> [#uses=1]
- ret i8 %retval1
-}
diff --git a/test/CodeGen/X86/2007-10-17-IllegalAsm.ll b/test/CodeGen/X86/2007-10-17-IllegalAsm.ll
deleted file mode 100644
index c0bb55e..0000000
--- a/test/CodeGen/X86/2007-10-17-IllegalAsm.ll
+++ /dev/null
@@ -1,87 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep addb | not grep x
-; RUN: llc < %s -mtriple=x86_64-linux-gnu | grep cmpb | not grep x
-; PR1734
-
-target triple = "x86_64-unknown-linux-gnu"
- %struct.CUMULATIVE_ARGS = type { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 }
- %struct.eh_status = type opaque
- %struct.emit_status = type { i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack*, i32, %struct.location_t, i32, i8*, %struct.rtx_def** }
- %struct.expr_status = type { i32, i32, i32, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def* }
- %struct.function = type { %struct.eh_status*, %struct.expr_status*, %struct.emit_status*, %struct.varasm_status*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.function*, i32, i32, i32, i32, %struct.rtx_def*, %struct.CUMULATIVE_ARGS, %struct.rtx_def*, %struct.rtx_def*, %struct.initial_value_struct*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, %struct.rtx_def*, i8, i32, i64, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, %struct.varray_head_tag*, %struct.temp_slot*, i32, %struct.var_refs_queue*, i32, i32, %struct.rtvec_def*, %struct.tree_node*, i32, i32, i32, %struct.machine_function*, i32, i32, i8, i8, %struct.language_function*, %struct.rtx_def*, i32, i32, i32, i32, %struct.location_t, %struct.varray_head_tag*, %struct.tree_node*, %struct.tree_node*, i8, i8, i8 }
- %struct.initial_value_struct = type opaque
- %struct.lang_decl = type opaque
- %struct.language_function = type opaque
- %struct.location_t = type { i8*, i32 }
- %struct.machine_function = type { %struct.stack_local_entry*, i8*, %struct.rtx_def*, i32, i32, i32, i32, i32 }
- %struct.rtunion = type { i8* }
- %struct.rtvec_def = type { i32, [1 x %struct.rtx_def*] }
- %struct.rtx_def = type { i16, i8, i8, %struct.u }
- %struct.sequence_stack = type { %struct.rtx_def*, %struct.rtx_def*, %struct.sequence_stack* }
- %struct.stack_local_entry = type opaque
- %struct.temp_slot = type opaque
- %struct.tree_common = type { %struct.tree_node*, %struct.tree_node*, %union.tree_ann_d*, i8, i8, i8, i8, i8 }
- %struct.tree_decl = type { %struct.tree_common, %struct.location_t, i32, %struct.tree_node*, i8, i8, i8, i8, i8, i8, i8, i8, i32, %struct.tree_decl_u1, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.tree_node*, %struct.rtx_def*, i32, %struct.tree_decl_u2, %struct.tree_node*, %struct.tree_node*, i64, %struct.lang_decl* }
- %struct.tree_decl_u1 = type { i64 }
- %struct.tree_decl_u2 = type { %struct.function* }
- %struct.tree_node = type { %struct.tree_decl }
- %struct.u = type { [1 x %struct.rtunion] }
- %struct.var_refs_queue = type { %struct.rtx_def*, i32, i32, %struct.var_refs_queue* }
- %struct.varasm_status = type opaque
- %struct.varray_data = type { [1 x i64] }
- %struct.varray_head_tag = type { i64, i64, i32, i8*, %struct.varray_data }
- %union.tree_ann_d = type opaque
-
-define void @layout_type(%struct.tree_node* %type) {
-entry:
- %tmp32 = load i32* null, align 8 ; <i32> [#uses=3]
- %tmp3435 = trunc i32 %tmp32 to i8 ; <i8> [#uses=1]
- %tmp53 = icmp eq %struct.tree_node* null, null ; <i1> [#uses=1]
- br i1 %tmp53, label %cond_next57, label %UnifiedReturnBlock
-
-cond_next57: ; preds = %entry
- %tmp65 = and i32 %tmp32, 255 ; <i32> [#uses=1]
- switch i32 %tmp65, label %UnifiedReturnBlock [
- i32 6, label %bb140
- i32 7, label %bb140
- i32 8, label %bb140
- i32 13, label %bb478
- ]
-
-bb140: ; preds = %cond_next57, %cond_next57, %cond_next57
- %tmp219 = load i32* null, align 8 ; <i32> [#uses=1]
- %tmp221222 = trunc i32 %tmp219 to i8 ; <i8> [#uses=1]
- %tmp223 = icmp eq i8 %tmp221222, 24 ; <i1> [#uses=1]
- br i1 %tmp223, label %cond_true226, label %cond_next340
-
-cond_true226: ; preds = %bb140
- switch i8 %tmp3435, label %cond_true288 [
- i8 6, label %cond_next340
- i8 9, label %cond_next340
- i8 7, label %cond_next340
- i8 8, label %cond_next340
- i8 10, label %cond_next340
- ]
-
-cond_true288: ; preds = %cond_true226
- unreachable
-
-cond_next340: ; preds = %cond_true226, %cond_true226, %cond_true226, %cond_true226, %cond_true226, %bb140
- ret void
-
-bb478: ; preds = %cond_next57
- br i1 false, label %cond_next500, label %cond_true497
-
-cond_true497: ; preds = %bb478
- unreachable
-
-cond_next500: ; preds = %bb478
- %tmp513 = load i32* null, align 8 ; <i32> [#uses=1]
- %tmp545 = and i32 %tmp513, 8192 ; <i32> [#uses=1]
- %tmp547 = and i32 %tmp32, -8193 ; <i32> [#uses=1]
- %tmp548 = or i32 %tmp547, %tmp545 ; <i32> [#uses=1]
- store i32 %tmp548, i32* null, align 8
- ret void
-
-UnifiedReturnBlock: ; preds = %cond_next57, %entry
- ret void
-}
diff --git a/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll b/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll
index 984094d..d02346d 100644
--- a/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll
+++ b/test/CodeGen/X86/2007-10-28-inlineasm-q-modifier.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s
+; RUN: llc -no-integrated-as < %s
; PR1748
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll b/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll
index 6b871aa..ec3bce9 100644
--- a/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll
+++ b/test/CodeGen/X86/2007-11-04-LiveVariablesBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu
+; RUN: llc -no-integrated-as < %s -mtriple=x86_64-unknown-linux-gnu
; PR1767
define void @xor_sse_2(i64 %bytes, i64* %p1, i64* %p2) {
diff --git a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
index c467024..d1699d5 100644
--- a/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
+++ b/test/CodeGen/X86/2007-11-04-rip-immediate-constant.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=static | FileCheck %s
+; RUN: llc < %s -relocation-model=static -no-integrated-as | FileCheck %s
; PR1761
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-pc-linux"
diff --git a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
index b06b249..319e884 100644
--- a/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
+++ b/test/CodeGen/X86/2008-02-20-InlineAsmClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -no-integrated-as < %s | FileCheck %s
; PR2078
; The clobber list says that "ax" is clobbered. Make sure that eax isn't
; allocated to the input/output register.
diff --git a/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll b/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
index 0b4eb3a..11b55a6 100644
--- a/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
+++ b/test/CodeGen/X86/2008-02-26-AsmDirectMemOp.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32"
target triple = "i386-pc-linux-gnu"
diff --git a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
index 18b3714..6b374a7 100644
--- a/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
+++ b/test/CodeGen/X86/2008-03-14-SpillerCrash.ll
@@ -6,7 +6,7 @@
%struct.locale_data = type { i8*, i8*, i32, i32, { void (%struct.locale_data*)*, %struct.anon }, i32, i32, i32, [0 x %struct.locale_data_value] }
%struct.locale_data_value = type { i32* }
-@wcstoll_l = alias i64 (i32*, i32**, i32, %struct.__locale_struct*)* @__wcstoll_l ; <i64 (i32*, i32**, i32, %struct.__locale_struct*)*> [#uses=0]
+@wcstoll_l = alias i64 (i32*, i32**, i32, %struct.__locale_struct*)* @__wcstoll_l
define i64 @____wcstoll_l_internal(i32* %nptr, i32** %endptr, i32 %base, i32 %group, %struct.__locale_struct* %loc) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-04-02-unnamedEH.ll b/test/CodeGen/X86/2008-04-02-unnamedEH.ll
index ab8ec80..70812ea 100644
--- a/test/CodeGen/X86/2008-04-02-unnamedEH.ll
+++ b/test/CodeGen/X86/2008-04-02-unnamedEH.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-cfi | FileCheck %s
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
@@ -11,6 +11,8 @@ define internal void @""() {
call i32 @_Z3barv( ) ; <i32>:4 [#uses=1]
ret void
}
-; CHECK: unnamed_1.eh
+
+; CHECK: ___unnamed_1:
+; CHECK-NEXT: .cfi_startproc
declare i32 @_Z3barv()
diff --git a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
index 5089e8c..d439e82 100644
--- a/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
+++ b/test/CodeGen/X86/2008-04-08-CoalescerCrash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx
+; RUN: llc < %s -mtriple=i686-pc-linux -mattr=+mmx
define i32 @t2() nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll b/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll
index d4805b4..6d45f1f 100644
--- a/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll
+++ b/test/CodeGen/X86/2008-04-26-Asm-Optimize-Imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -no-integrated-as < %s | FileCheck %s
; rdar://5720231
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i386-apple-darwin8"
diff --git a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
index 496779c..51064f1 100644
--- a/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
+++ b/test/CodeGen/X86/2008-08-31-EH_RETURN64.ll
@@ -9,7 +9,7 @@ target triple = "x86_64-unknown-linux-gnu"
; CHECK: movq %rsp, %rbp
; CHECK: popq %rbp
; CHECK: movq %rcx, %rsp
-; CHECK: ret # eh_return, addr: %rcx
+; CHECK: retq # eh_return, addr: %rcx
define i8* @test(i64 %a, i8* %b) {
entry:
call void @llvm.eh.unwind.init()
diff --git a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
index 5c2fbee..f4a43a1 100644
--- a/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
+++ b/test/CodeGen/X86/2008-09-18-inline-asm-2.ll
@@ -1,6 +1,6 @@
-; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=basic | FileCheck %s
-; RUN: llc < %s -march=x86 -regalloc=greedy | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=fast -optimize-regalloc=0 -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=basic -no-integrated-as | FileCheck %s
+; RUN: llc < %s -march=x86 -regalloc=greedy -no-integrated-as | FileCheck %s
; The 1st, 2nd, 3rd and 5th registers must all be different. The registers
; referenced in the 4th and 6th operands must not be the same as the 1st or 5th
diff --git a/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll b/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
index b2e6061..2b2f704 100644
--- a/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
+++ b/test/CodeGen/X86/2008-10-17-Asm64bitRConstraint.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -march=x86-64 -no-integrated-as
define void @test(i64 %x) nounwind {
entry:
diff --git a/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll b/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
index 353d1c7..e23dfe5 100644
--- a/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
+++ b/test/CodeGen/X86/2008-10-20-AsmDoubleInI32.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -march=x86
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -march=x86 -no-integrated-as
+; RUN: llc < %s -march=x86-64 -no-integrated-as
; from gcc.c-torture/compile/920520-1.c
diff --git a/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll b/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll
deleted file mode 100644
index 2e27811..0000000
--- a/test/CodeGen/X86/2008-12-12-PrivateEHSymbol.ll
+++ /dev/null
@@ -1,10 +0,0 @@
-; RUN: llc < %s -disable-cfi -march=x86-64 -mtriple=x86_64-apple-darwin9 | grep ^__Z1fv.eh
-; RUN: llc < %s -disable-cfi -march=x86 -mtriple=i386-apple-darwin9 | grep ^__Z1fv.eh
-
-define void @_Z1fv() {
-entry:
- br label %return
-
-return:
- ret void
-}
diff --git a/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll b/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
index 7549651..5004f04 100644
--- a/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
+++ b/test/CodeGen/X86/2009-02-12-InlineAsm-nieZ-constraints.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -no-integrated-as | FileCheck %s
; ModuleID = 'shant.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
diff --git a/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll b/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll
index 3d70b58..bd1b47a 100644
--- a/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll
+++ b/test/CodeGen/X86/2009-04-13-2AddrAssert-2.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin
+; RUN: llc < %s -mtriple=i386-apple-darwin -no-integrated-as
; rdar://6781755
; PR3934
diff --git a/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll b/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll
index 7468acb..fa240f6 100644
--- a/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll
+++ b/test/CodeGen/X86/2009-05-08-InlineAsmIOffset.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=static | FileCheck %s
+; RUN: llc < %s -relocation-model=static -no-integrated-as | FileCheck %s
; PR4152
; CHECK: {{1: ._pv_cpu_ops[+]8}}
diff --git a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
index 5f5d5cc..50c62df 100644
--- a/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
+++ b/test/CodeGen/X86/2009-06-05-VZextByteShort.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+mmx,+sse2 > %t1
+; RUN: llc < %s -march=x86 -mcpu=core2 > %t1
; RUN: grep movzwl %t1 | count 2
; RUN: grep movzbl %t1 | count 1
; RUN: grep movd %t1 | count 4
diff --git a/test/CodeGen/X86/2009-08-23-linkerprivate.ll b/test/CodeGen/X86/2009-08-23-linkerprivate.ll
deleted file mode 100644
index 90fac15..0000000
--- a/test/CodeGen/X86/2009-08-23-linkerprivate.ll
+++ /dev/null
@@ -1,8 +0,0 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-apple-darwin | FileCheck %s
-
-; ModuleID = '/Volumes/MacOS9/tests/WebKit/JavaScriptCore/profiler/ProfilerServer.mm'
-
-@"\01l_objc_msgSend_fixup_alloc" = linker_private_weak hidden global i32 0, section "__DATA, __objc_msgrefs, coalesced", align 16
-
-; CHECK: .globl l_objc_msgSend_fixup_alloc
-; CHECK: .weak_definition l_objc_msgSend_fixup_alloc
diff --git a/test/CodeGen/X86/2009-09-19-earlyclobber.ll b/test/CodeGen/X86/2009-09-19-earlyclobber.ll
index 66f5118..7df62fd 100644
--- a/test/CodeGen/X86/2009-09-19-earlyclobber.ll
+++ b/test/CodeGen/X86/2009-09-19-earlyclobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s | FileCheck %s
+; RUN: llc -no-integrated-as < %s | FileCheck %s
; ModuleID = '4964.c'
; PR 4964
; Registers other than RAX, RCX are OK, but they must be different.
diff --git a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
index 08a99e3..b828c27 100644
--- a/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
+++ b/test/CodeGen/X86/2009-11-16-UnfoldMemOpBug.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck %s
; rdar://7396984
-@str = private constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1
+@str = private unnamed_addr constant [28 x i8] c"xxxxxxxxxxxxxxxxxxxxxxxxxxx\00", align 1
define void @t(i32 %count) ssp nounwind {
entry:
diff --git a/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll b/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll
index b166447..5c10c55 100644
--- a/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll
+++ b/test/CodeGen/X86/2009-12-01-EarlyClobberBug.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
; pr5391
define void @t() nounwind ssp {
diff --git a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
index 74a5ec2..fc8c895 100644
--- a/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
+++ b/test/CodeGen/X86/2010-05-05-LocalAllocEarlyClobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -O0 -regalloc=fast | FileCheck %s
+; RUN: llc < %s -O0 -regalloc=fast -no-integrated-as | FileCheck %s
; PR6520
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32"
diff --git a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
index c5736eb..e11b538 100644
--- a/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
+++ b/test/CodeGen/X86/2010-05-26-DotDebugLoc.ll
@@ -26,7 +26,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!0 = metadata !{i32 786484, i32 0, metadata !1, metadata !"ret", metadata !"ret", metadata !"", metadata !1, i32 7, metadata !3, i1 false, i1 true, null, null} ; [ DW_TAG_variable ]
!1 = metadata !{i32 786473, metadata !36} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31, metadata !31, metadata !""} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{i32 786449, metadata !36, i32 1, metadata !"4.2.1 (Based on Apple Inc. build 5658) (LLVM build)", i1 true, metadata !"", i32 0, metadata !37, metadata !37, metadata !32, metadata !31, metadata !37, metadata !""} ; [ DW_TAG_compile_unit ]
!3 = metadata !{i32 786468, metadata !36, metadata !1, metadata !"int", i32 0, i64 32, i64 32, i64 0, i32 0, i32 5} ; [ DW_TAG_base_type ]
!4 = metadata !{i32 786689, metadata !5, metadata !"x", metadata !1, i32 12, metadata !3, i32 0, null} ; [ DW_TAG_arg_variable ]
!5 = metadata !{i32 786478, metadata !36, metadata !1, metadata !"foo", metadata !"foo", metadata !"foo", i32 13, metadata !6, i1 false, i1 true, i32 0, i32 0, null, i1 false, i1 true, void (i32)* @foo, null, null, metadata !33, i32 13} ; [ DW_TAG_subprogram ]
@@ -61,7 +61,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!34 = metadata !{metadata !8}
!35 = metadata !{metadata !18, metadata !25, metadata !26}
!36 = metadata !{metadata !"foo.c", metadata !"/tmp/"}
-!37 = metadata !{i32 0}
+!37 = metadata !{}
; The variable bar:myvar changes registers after the first movq.
; It is cobbered by popq %rbx
diff --git a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
index 9b47bb7..0f8855d 100644
--- a/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
+++ b/test/CodeGen/X86/2010-06-15-FastAllocEarlyCLobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -regalloc=fast -optimize-regalloc=0 < %s | FileCheck %s
+; RUN: llc -regalloc=fast -optimize-regalloc=0 -no-integrated-as < %s | FileCheck %s
; PR7382
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-unknown-linux-gnu"
diff --git a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
index 68a6a13..0df9dc1 100644
--- a/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
+++ b/test/CodeGen/X86/2010-06-25-asm-RA-crash.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-fp-elim -mtriple=i686-pc-mingw32
+; RUN: llc < %s -disable-fp-elim -mtriple=i686-pc-mingw32 -no-integrated-as
%struct.__SEH2Frame = type {}
diff --git a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
index e1491a0..d7bc21f 100644
--- a/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
+++ b/test/CodeGen/X86/2010-06-28-FastAllocTiedOperand.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -O0 | FileCheck %s
+; RUN: llc < %s -march=x86 -O0 -no-integrated-as | FileCheck %s
; PR7509
target triple = "i386-apple-darwin10"
%asmtype = type { i32, i8*, i32, i32 }
diff --git a/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll b/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll
index 82dac9d..a0798ae 100644
--- a/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll
+++ b/test/CodeGen/X86/2010-06-28-matched-g-constraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin11 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin11 -no-integrated-as | FileCheck %s
; Any register is OK for %0, but it must be a register, not memory.
define i32 @foo() nounwind ssp {
diff --git a/test/CodeGen/X86/2010-07-02-asm-alignstack.ll b/test/CodeGen/X86/2010-07-02-asm-alignstack.ll
index 0bbb24f..4302add 100644
--- a/test/CodeGen/X86/2010-07-02-asm-alignstack.ll
+++ b/test/CodeGen/X86/2010-07-02-asm-alignstack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -no-integrated-as | FileCheck %s
define void @foo() nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2010-07-06-asm-RIP.ll b/test/CodeGen/X86/2010-07-06-asm-RIP.ll
index 9526b8d..818bbc6 100644
--- a/test/CodeGen/X86/2010-07-06-asm-RIP.ll
+++ b/test/CodeGen/X86/2010-07-06-asm-RIP.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
; PR 4752
@n = global i32 0 ; <i32*> [#uses=2]
diff --git a/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll b/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll
index 97cbe3e..306e22a 100644
--- a/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll
+++ b/test/CodeGen/X86/2010-07-13-indirectXconstraint.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
; PR 7528
; formerly crashed
diff --git a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
index 9aa41c3..a65b632 100644
--- a/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
+++ b/test/CodeGen/X86/2010-09-16-EmptyFilename.ll
@@ -1,6 +1,6 @@
; RUN: llc -O0 -mtriple=x86_64-apple-darwin10 < %s - | FileCheck %s
; Radar 8286101
-; CHECK: .file 2 "<stdin>"
+; CHECK: .file {{[0-9]+}} "<stdin>"
define i32 @foo() nounwind ssp {
entry:
diff --git a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
index 0e4118a..f69cedc 100644
--- a/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
+++ b/test/CodeGen/X86/2010-10-08-cmpxchg8b.ll
@@ -18,7 +18,7 @@ entry:
loop:
; CHECK: lock
; CHECK-NEXT: cmpxchg8b
- %r = cmpxchg i64* %ptr, i64 0, i64 1 monotonic
+ %r = cmpxchg i64* %ptr, i64 0, i64 1 monotonic monotonic
%stored1 = icmp eq i64 %r, 0
br i1 %stored1, label %loop, label %continue
continue:
diff --git a/test/CodeGen/X86/2010-12-02-MC-Set.ll b/test/CodeGen/X86/2010-12-02-MC-Set.ll
deleted file mode 100644
index 5a407d3..0000000
--- a/test/CodeGen/X86/2010-12-02-MC-Set.ll
+++ /dev/null
@@ -1,27 +0,0 @@
-; RUN: llc < %s -disable-dot-loc -mtriple=x86_64-apple-darwin -O0 | FileCheck %s
-
-
-define void @foo() nounwind ssp {
-entry:
- ret void, !dbg !5
-}
-
-!llvm.dbg.cu = !{!2}
-!llvm.module.flags = !{!10}
-!7 = metadata !{metadata !0}
-
-!0 = metadata !{i32 786478, metadata !9, metadata !1, metadata !"foo", metadata !"foo", metadata !"", i32 3, metadata !3, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 false, void ()* @foo, null, null, null, i32 0} ; [ DW_TAG_subprogram ]
-!1 = metadata !{i32 786473, metadata !9} ; [ DW_TAG_file_type ]
-!2 = metadata !{i32 786449, metadata !9, i32 12, metadata !"clang version 2.9 (trunk 120563)", i1 false, metadata !"", i32 0, metadata !8, metadata !8, metadata !7, null, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!3 = metadata !{i32 786453, metadata !9, metadata !1, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
-!4 = metadata !{null}
-!5 = metadata !{i32 5, i32 1, metadata !6, null}
-!6 = metadata !{i32 786443, metadata !9, metadata !0, i32 3, i32 16, i32 0} ; [ DW_TAG_lexical_block ]
-!8 = metadata !{i32 0}
-!9 = metadata !{metadata !"e.c", metadata !"/private/tmp"}
-
-; CHECK: .subsections_via_symbols
-; CHECK-NEXT: __debug_line
-; CHECK-NEXT: Lline_table_start0
-; CHECK-NEXT: Ltmp{{[0-9]}} = (Ltmp
-!10 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
diff --git a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
index d534030..f016528 100644
--- a/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
+++ b/test/CodeGen/X86/2011-01-24-DbgValue-Before-Use.ll
@@ -1,14 +1,20 @@
-; RUN: llc < %s | FileCheck %s
-; RUN: llc < %s -regalloc=basic | FileCheck %s
+; RUN: llc < %s -filetype=obj | llvm-dwarfdump -debug-dump=info - | FileCheck %s
+; RUN: llc < %s -filetype=obj -regalloc=basic | llvm-dwarfdump -debug-dump=info - | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
; Check debug info for variable z_s
-;CHECK: .long Lset14
-;CHECK-NEXT: ## DW_AT_decl_file
-;CHECK-NEXT: ## DW_AT_decl_line
-;CHECK-NEXT: ## DW_AT_type
-;CHECK-NEXT: ## DW_AT_location
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_subprogram
+; CHECK: DW_TAG_variable
+; CHECK: DW_TAG_variable
+; CHECK-NEXT: DW_AT_name {{.*}} "z_s"
+; CHECK-NEXT: DW_AT_decl_file
+; CHECK-NEXT: DW_AT_decl_line
+; CHECK-NEXT: DW_AT_type{{.*}}{[[TYPE:.*]]}
+; CHECK-NEXT: DW_AT_location
+; CHECK: [[TYPE]]:
+; CHECK-NEXT: DW_AT_name {{.*}} "int"
@.str1 = private unnamed_addr constant [14 x i8] c"m=%u, z_s=%d\0A\00"
diff --git a/test/CodeGen/X86/2011-05-09-loaduse.ll b/test/CodeGen/X86/2011-05-09-loaduse.ll
index adcea5c..c772e4c 100644
--- a/test/CodeGen/X86/2011-05-09-loaduse.ll
+++ b/test/CodeGen/X86/2011-05-09-loaduse.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
;CHECK-LABEL: test:
-;CHECK-not: pshufd
+;CHECK-NOT: pshufd
;CHECK: ret
define float @test(<4 x float>* %A) nounwind {
entry:
diff --git a/test/CodeGen/X86/2011-10-11-SpillDead.ll b/test/CodeGen/X86/2011-10-11-SpillDead.ll
index 8e70d65..19c3d6c 100644
--- a/test/CodeGen/X86/2011-10-11-SpillDead.ll
+++ b/test/CodeGen/X86/2011-10-11-SpillDead.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -verify-regalloc
+; RUN: llc < %s -verify-regalloc -no-integrated-as
; PR11125
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.7"
diff --git a/test/CodeGen/X86/2011-10-19-widen_vselect.ll b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
index e08c5b2..222068d 100644
--- a/test/CodeGen/X86/2011-10-19-widen_vselect.ll
+++ b/test/CodeGen/X86/2011-10-19-widen_vselect.ll
@@ -1,12 +1,10 @@
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
-target triple = "x86_64-unknown-linux-gnu"
-
-; Make sure that we don't crash when legalizng vselect and vsetcc and that
+; Make sure that we don't crash when legalizing vselect and vsetcc and that
; we are able to generate vector blend instructions.
-; CHECK: simple_widen
-; CHECK: blend
+; CHECK-LABEL: simple_widen
+; CHECK-NOT: blend
; CHECK: ret
define void @simple_widen() {
entry:
@@ -15,7 +13,7 @@ entry:
ret void
}
-; CHECK: complex_inreg_work
+; CHECK-LABEL: complex_inreg_work
; CHECK: blend
; CHECK: ret
@@ -27,8 +25,8 @@ entry:
ret void
}
-; CHECK: zero_test
-; CHECK: blend
+; CHECK-LABEL: zero_test
+; CHECK: xorps %xmm0, %xmm0
; CHECK: ret
define void @zero_test() {
@@ -38,7 +36,7 @@ entry:
ret void
}
-; CHECK: full_test
+; CHECK-LABEL: full_test
; CHECK: blend
; CHECK: ret
diff --git a/test/CodeGen/X86/2011-12-28-vselecti8.ll b/test/CodeGen/X86/2011-12-28-vselecti8.ll
index dbc122a..c916466 100644
--- a/test/CodeGen/X86/2011-12-28-vselecti8.ll
+++ b/test/CodeGen/X86/2011-12-28-vselecti8.ll
@@ -3,10 +3,20 @@
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-darwin11.2.0"
-; CHECK: @foo8
-; CHECK: psll
-; CHECK: psraw
-; CHECK: pblendvb
+; During legalization, the vselect mask is 'type legalized' into a
+; wider BUILD_VECTOR. This causes the introduction of a new
+; sign_extend_inreg in the DAG.
+;
+; A sign_extend_inreg of a vector of ConstantSDNode or undef can be
+; always folded into a simple build_vector.
+;
+; Make sure that the sign_extend_inreg is simplified and that we
+; don't generate psll, psraw and pblendvb from the vselect.
+
+; CHECK-LABEL: foo8
+; CHECK-NOT: psll
+; CHECK-NOT: psraw
+; CHECK-NOT: pblendvb
; CHECK: ret
define void @foo8(float* nocapture %RET) nounwind {
allocas:
@@ -17,4 +27,3 @@ allocas:
ret void
}
-
diff --git a/test/CodeGen/X86/2012-08-17-legalizer-crash.ll b/test/CodeGen/X86/2012-08-17-legalizer-crash.ll
index 971e56d..0d18267 100644
--- a/test/CodeGen/X86/2012-08-17-legalizer-crash.ll
+++ b/test/CodeGen/X86/2012-08-17-legalizer-crash.ll
@@ -27,5 +27,5 @@ if.end: ; preds = %if.then, %entry
; CHECK-LABEL: fn1:
; CHECK: shrq $32, [[REG:%.*]]
-; CHECK: je
+; CHECK: sete
}
diff --git a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
index d41b432..62ee1e1 100644
--- a/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-handlemove-dbg.ll
@@ -38,10 +38,8 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!12}
-!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !3, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
-!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{null}
+!0 = metadata !{i32 786449, metadata !11, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/Olden/bh/newbh.c] [DW_LANG_C99]
+!2 = metadata !{}
!4 = metadata !{i32 786689, null, metadata !"hg", metadata !5, i32 67109589, metadata !6, i32 0, i32 0} ; [ DW_TAG_arg_variable ] [hg] [line 725]
!5 = metadata !{i32 786473, metadata !11} ; [ DW_TAG_file_type ]
!6 = metadata !{i32 786454, metadata !11, null, metadata !"hgstruct", i32 492, i64 0, i64 0, i64 0, i32 0, metadata !7} ; [ DW_TAG_typedef ] [hgstruct] [line 492, size 0, align 0, offset 0] [from ]
diff --git a/test/CodeGen/X86/2012-11-30-misched-dbg.ll b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
index 7befa6b..650839a 100644
--- a/test/CodeGen/X86/2012-11-30-misched-dbg.ll
+++ b/test/CodeGen/X86/2012-11-30-misched-dbg.ll
@@ -65,10 +65,9 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!35}
-!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !3, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
+!0 = metadata !{i32 786449, metadata !19, i32 12, metadata !"clang version 3.3 (trunk 168918) (llvm/trunk 168920)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ] [MultiSource/Benchmarks/MiBench/consumer-typeset/MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c] [DW_LANG_C99]
!1 = metadata !{metadata !2}
-!2 = metadata !{i32 0}
-!3 = metadata !{}
+!2 = metadata !{}
!4 = metadata !{i32 786688, metadata !5, metadata !"num1", metadata !14, i32 815, metadata !15, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [num1] [line 815]
!5 = metadata !{i32 786443, metadata !6, i32 815, i32 0, metadata !14, i32 177} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!6 = metadata !{i32 786443, metadata !7, i32 812, i32 0, metadata !14, i32 176} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
@@ -78,7 +77,7 @@ declare i32 @__sprintf_chk(i8*, i32, i64, i8*, ...)
!10 = metadata !{i32 786443, metadata !11, i32 434, i32 0, metadata !14, i32 90} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!11 = metadata !{i32 786443, metadata !12, i32 250, i32 0, metadata !14, i32 24} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!12 = metadata !{i32 786443, metadata !13, i32 249, i32 0, metadata !14, i32 23} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
-!13 = metadata !{i32 786443, metadata !3, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
+!13 = metadata !{i32 786443, metadata !2, i32 221, i32 0, metadata !14, i32 19} ; [ DW_TAG_lexical_block ] [MultiSource/Benchmarks/MiBench/consumer-typeset/z19.c]
!14 = metadata !{i32 786473, metadata !19} ; [ DW_TAG_file_type ]
!15 = metadata !{i32 786433, null, null, metadata !"", i32 0, i64 160, i64 8, i32 0, i32 0, metadata !16, metadata !17, i32 0, null, null, null} ; [ DW_TAG_array_type ] [line 0, size 160, align 8, offset 0] [from char]
!16 = metadata !{i32 786468, null, null, metadata !"char", i32 0, i64 8, i64 8, i64 0, i32 0, i32 6} ; [ DW_TAG_base_type ] [char] [line 0, size 8, align 8, offset 0, enc DW_ATE_signed_char]
diff --git a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
index 3455b68..bbba796 100644
--- a/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
+++ b/test/CodeGen/X86/2013-10-14-FastISel-incorrect-vreg.ll
@@ -3,7 +3,7 @@
; During X86 fastisel, the address of indirect call was resolved
; through bitcast, ptrtoint, and inttoptr instructions. This is valid
; only if the related instructions are in that same basic block, otherwise
-; we may reference variables that were not live accross basic blocks
+; we may reference variables that were not live across basic blocks
; resulting in undefined virtual registers.
;
; In this example, this is illustrated by the spill/reload of the
@@ -25,7 +25,7 @@
; CHECK: movq [[ARG2_SLOT]], %rdi
; Load the second argument
; CHECK: movq [[ARG2_SLOT]], %rsi
-; Load the thrid argument
+; Load the third argument
; CHECK: movq [[ARG2_SLOT]], %rdx
; Load the function pointer.
; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
@@ -64,7 +64,7 @@ label_end:
; CHECK: movq [[ARG2_SLOT]], %rdi
; Load the second argument
; CHECK: movq [[ARG2_SLOT]], %rsi
-; Load the thrid argument
+; Load the third argument
; CHECK: movq [[ARG2_SLOT]], %rdx
; Load the function pointer.
; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
@@ -103,7 +103,7 @@ label_end:
; CHECK: movq [[ARG2_SLOT]], %rdi
; Load the second argument
; CHECK: movq [[ARG2_SLOT]], %rsi
-; Load the thrid argument
+; Load the third argument
; CHECK: movq [[ARG2_SLOT]], %rdx
; Load the function pointer.
; CHECK: movq [[LOADED_PTR_SLOT]], [[FCT_PTR:%[a-z]+]]
diff --git a/test/CodeGen/X86/3addr-16bit.ll b/test/CodeGen/X86/3addr-16bit.ll
index fafdfdb..2d6a5e7 100644
--- a/test/CodeGen/X86/3addr-16bit.ll
+++ b/test/CodeGen/X86/3addr-16bit.ll
@@ -34,7 +34,7 @@ entry:
; 64BIT-LABEL: t2:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: decl %eax
+; 64BIT: leal -1(%rsi), %eax
; 64BIT: movzwl %ax
%0 = icmp eq i16 %k, %c ; <i1> [#uses=1]
%1 = add i16 %k, -1 ; <i16> [#uses=3]
@@ -59,7 +59,7 @@ entry:
; 64BIT-LABEL: t3:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: addl $2, %eax
+; 64BIT: leal 2(%rsi), %eax
%0 = add i16 %k, 2 ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
@@ -82,7 +82,7 @@ entry:
; 64BIT-LABEL: t4:
; 64BIT-NOT: movw %si, %ax
-; 64BIT: addl %edi, %eax
+; 64BIT: leal (%rsi,%rdi), %eax
%0 = add i16 %k, %c ; <i16> [#uses=3]
%1 = icmp eq i16 %k, %c ; <i1> [#uses=1]
br i1 %1, label %bb, label %bb1
diff --git a/test/CodeGen/X86/Atomics-64.ll b/test/CodeGen/X86/Atomics-64.ll
index 8b0a349..c274688 100644
--- a/test/CodeGen/X86/Atomics-64.ll
+++ b/test/CodeGen/X86/Atomics-64.ll
@@ -704,7 +704,7 @@ entry:
%3 = zext i8 %2 to i32
%4 = trunc i32 %3 to i8
%5 = trunc i32 %1 to i8
- %6 = cmpxchg i8* @sc, i8 %4, i8 %5 monotonic
+ %6 = cmpxchg i8* @sc, i8 %4, i8 %5 monotonic monotonic
store i8 %6, i8* @sc, align 1
%7 = load i8* @sc, align 1
%8 = zext i8 %7 to i32
@@ -712,7 +712,7 @@ entry:
%10 = zext i8 %9 to i32
%11 = trunc i32 %10 to i8
%12 = trunc i32 %8 to i8
- %13 = cmpxchg i8* @uc, i8 %11, i8 %12 monotonic
+ %13 = cmpxchg i8* @uc, i8 %11, i8 %12 monotonic monotonic
store i8 %13, i8* @uc, align 1
%14 = load i8* @sc, align 1
%15 = sext i8 %14 to i16
@@ -722,7 +722,7 @@ entry:
%19 = bitcast i8* bitcast (i16* @ss to i8*) to i16*
%20 = trunc i32 %18 to i16
%21 = trunc i32 %16 to i16
- %22 = cmpxchg i16* %19, i16 %20, i16 %21 monotonic
+ %22 = cmpxchg i16* %19, i16 %20, i16 %21 monotonic monotonic
store i16 %22, i16* @ss, align 2
%23 = load i8* @sc, align 1
%24 = sext i8 %23 to i16
@@ -732,49 +732,49 @@ entry:
%28 = bitcast i8* bitcast (i16* @us to i8*) to i16*
%29 = trunc i32 %27 to i16
%30 = trunc i32 %25 to i16
- %31 = cmpxchg i16* %28, i16 %29, i16 %30 monotonic
+ %31 = cmpxchg i16* %28, i16 %29, i16 %30 monotonic monotonic
store i16 %31, i16* @us, align 2
%32 = load i8* @sc, align 1
%33 = sext i8 %32 to i32
%34 = load i8* @uc, align 1
%35 = zext i8 %34 to i32
%36 = bitcast i8* bitcast (i32* @si to i8*) to i32*
- %37 = cmpxchg i32* %36, i32 %35, i32 %33 monotonic
+ %37 = cmpxchg i32* %36, i32 %35, i32 %33 monotonic monotonic
store i32 %37, i32* @si, align 4
%38 = load i8* @sc, align 1
%39 = sext i8 %38 to i32
%40 = load i8* @uc, align 1
%41 = zext i8 %40 to i32
%42 = bitcast i8* bitcast (i32* @ui to i8*) to i32*
- %43 = cmpxchg i32* %42, i32 %41, i32 %39 monotonic
+ %43 = cmpxchg i32* %42, i32 %41, i32 %39 monotonic monotonic
store i32 %43, i32* @ui, align 4
%44 = load i8* @sc, align 1
%45 = sext i8 %44 to i64
%46 = load i8* @uc, align 1
%47 = zext i8 %46 to i64
%48 = bitcast i8* bitcast (i64* @sl to i8*) to i64*
- %49 = cmpxchg i64* %48, i64 %47, i64 %45 monotonic
+ %49 = cmpxchg i64* %48, i64 %47, i64 %45 monotonic monotonic
store i64 %49, i64* @sl, align 8
%50 = load i8* @sc, align 1
%51 = sext i8 %50 to i64
%52 = load i8* @uc, align 1
%53 = zext i8 %52 to i64
%54 = bitcast i8* bitcast (i64* @ul to i8*) to i64*
- %55 = cmpxchg i64* %54, i64 %53, i64 %51 monotonic
+ %55 = cmpxchg i64* %54, i64 %53, i64 %51 monotonic monotonic
store i64 %55, i64* @ul, align 8
%56 = load i8* @sc, align 1
%57 = sext i8 %56 to i64
%58 = load i8* @uc, align 1
%59 = zext i8 %58 to i64
%60 = bitcast i8* bitcast (i64* @sll to i8*) to i64*
- %61 = cmpxchg i64* %60, i64 %59, i64 %57 monotonic
+ %61 = cmpxchg i64* %60, i64 %59, i64 %57 monotonic monotonic
store i64 %61, i64* @sll, align 8
%62 = load i8* @sc, align 1
%63 = sext i8 %62 to i64
%64 = load i8* @uc, align 1
%65 = zext i8 %64 to i64
%66 = bitcast i8* bitcast (i64* @ull to i8*) to i64*
- %67 = cmpxchg i64* %66, i64 %65, i64 %63 monotonic
+ %67 = cmpxchg i64* %66, i64 %65, i64 %63 monotonic monotonic
store i64 %67, i64* @ull, align 8
%68 = load i8* @sc, align 1
%69 = zext i8 %68 to i32
@@ -782,7 +782,7 @@ entry:
%71 = zext i8 %70 to i32
%72 = trunc i32 %71 to i8
%73 = trunc i32 %69 to i8
- %74 = cmpxchg i8* @sc, i8 %72, i8 %73 monotonic
+ %74 = cmpxchg i8* @sc, i8 %72, i8 %73 monotonic monotonic
%75 = icmp eq i8 %74, %72
%76 = zext i1 %75 to i8
%77 = zext i8 %76 to i32
@@ -793,7 +793,7 @@ entry:
%81 = zext i8 %80 to i32
%82 = trunc i32 %81 to i8
%83 = trunc i32 %79 to i8
- %84 = cmpxchg i8* @uc, i8 %82, i8 %83 monotonic
+ %84 = cmpxchg i8* @uc, i8 %82, i8 %83 monotonic monotonic
%85 = icmp eq i8 %84, %82
%86 = zext i1 %85 to i8
%87 = zext i8 %86 to i32
@@ -805,7 +805,7 @@ entry:
%92 = zext i8 %91 to i32
%93 = trunc i32 %92 to i8
%94 = trunc i32 %90 to i8
- %95 = cmpxchg i8* bitcast (i16* @ss to i8*), i8 %93, i8 %94 monotonic
+ %95 = cmpxchg i8* bitcast (i16* @ss to i8*), i8 %93, i8 %94 monotonic monotonic
%96 = icmp eq i8 %95, %93
%97 = zext i1 %96 to i8
%98 = zext i8 %97 to i32
@@ -817,7 +817,7 @@ entry:
%103 = zext i8 %102 to i32
%104 = trunc i32 %103 to i8
%105 = trunc i32 %101 to i8
- %106 = cmpxchg i8* bitcast (i16* @us to i8*), i8 %104, i8 %105 monotonic
+ %106 = cmpxchg i8* bitcast (i16* @us to i8*), i8 %104, i8 %105 monotonic monotonic
%107 = icmp eq i8 %106, %104
%108 = zext i1 %107 to i8
%109 = zext i8 %108 to i32
@@ -828,7 +828,7 @@ entry:
%113 = zext i8 %112 to i32
%114 = trunc i32 %113 to i8
%115 = trunc i32 %111 to i8
- %116 = cmpxchg i8* bitcast (i32* @si to i8*), i8 %114, i8 %115 monotonic
+ %116 = cmpxchg i8* bitcast (i32* @si to i8*), i8 %114, i8 %115 monotonic monotonic
%117 = icmp eq i8 %116, %114
%118 = zext i1 %117 to i8
%119 = zext i8 %118 to i32
@@ -839,7 +839,7 @@ entry:
%123 = zext i8 %122 to i32
%124 = trunc i32 %123 to i8
%125 = trunc i32 %121 to i8
- %126 = cmpxchg i8* bitcast (i32* @ui to i8*), i8 %124, i8 %125 monotonic
+ %126 = cmpxchg i8* bitcast (i32* @ui to i8*), i8 %124, i8 %125 monotonic monotonic
%127 = icmp eq i8 %126, %124
%128 = zext i1 %127 to i8
%129 = zext i8 %128 to i32
@@ -850,7 +850,7 @@ entry:
%133 = zext i8 %132 to i64
%134 = trunc i64 %133 to i8
%135 = trunc i64 %131 to i8
- %136 = cmpxchg i8* bitcast (i64* @sl to i8*), i8 %134, i8 %135 monotonic
+ %136 = cmpxchg i8* bitcast (i64* @sl to i8*), i8 %134, i8 %135 monotonic monotonic
%137 = icmp eq i8 %136, %134
%138 = zext i1 %137 to i8
%139 = zext i8 %138 to i32
@@ -861,7 +861,7 @@ entry:
%143 = zext i8 %142 to i64
%144 = trunc i64 %143 to i8
%145 = trunc i64 %141 to i8
- %146 = cmpxchg i8* bitcast (i64* @ul to i8*), i8 %144, i8 %145 monotonic
+ %146 = cmpxchg i8* bitcast (i64* @ul to i8*), i8 %144, i8 %145 monotonic monotonic
%147 = icmp eq i8 %146, %144
%148 = zext i1 %147 to i8
%149 = zext i8 %148 to i32
@@ -872,7 +872,7 @@ entry:
%153 = zext i8 %152 to i64
%154 = trunc i64 %153 to i8
%155 = trunc i64 %151 to i8
- %156 = cmpxchg i8* bitcast (i64* @sll to i8*), i8 %154, i8 %155 monotonic
+ %156 = cmpxchg i8* bitcast (i64* @sll to i8*), i8 %154, i8 %155 monotonic monotonic
%157 = icmp eq i8 %156, %154
%158 = zext i1 %157 to i8
%159 = zext i8 %158 to i32
@@ -883,7 +883,7 @@ entry:
%163 = zext i8 %162 to i64
%164 = trunc i64 %163 to i8
%165 = trunc i64 %161 to i8
- %166 = cmpxchg i8* bitcast (i64* @ull to i8*), i8 %164, i8 %165 monotonic
+ %166 = cmpxchg i8* bitcast (i64* @ull to i8*), i8 %164, i8 %165 monotonic monotonic
%167 = icmp eq i8 %166, %164
%168 = zext i1 %167 to i8
%169 = zext i8 %168 to i32
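The repeated edit in this file reflects the updated cmpxchg syntax: the instruction now takes separate success and failure orderings. A minimal standalone sketch in the same 3.5-era IR (illustrative only, not part of the patch; the failure ordering must be no stronger than the success ordering):

define i8 @cas_example(i8* %p, i8 %old, i8 %new) {
entry:
  ; success ordering 'acquire', failure ordering 'monotonic'
  %v = cmpxchg i8* %p, i8 %old, i8 %new acquire monotonic
  ret i8 %v
}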
diff --git a/test/CodeGen/X86/GC/ocaml-gc.ll b/test/CodeGen/X86/GC/ocaml-gc.ll
index 6d5f8ae..37ddaf9 100644
--- a/test/CodeGen/X86/GC/ocaml-gc.ll
+++ b/test/CodeGen/X86/GC/ocaml-gc.ll
@@ -1,8 +1,10 @@
; RUN: llc < %s -mtriple=x86_64-linux-gnu | FileCheck %s
-define i32 @main(i32 %x) nounwind gc "ocaml" {
; CHECK: .text
-; CHECK-NEXT: .globl "caml<stdin>__code_begin"
+; CHECK-NEXT: .file "<stdin>"
+
+define i32 @main(i32 %x) nounwind gc "ocaml" {
+; CHECK: .globl "caml<stdin>__code_begin"
; CHECK-NEXT: "caml<stdin>__code_begin":
; CHECK-NEXT: .data
; CHECK-NEXT: .globl "caml<stdin>__data_begin"
diff --git a/test/CodeGen/X86/MachineBranchProb.ll b/test/CodeGen/X86/MachineBranchProb.ll
new file mode 100644
index 0000000..a893152
--- /dev/null
+++ b/test/CodeGen/X86/MachineBranchProb.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
+
+;; Make sure a transformation in SelectionDAGBuilder that converts "or + br" to
+;; two branches correctly updates the branch probability.
+
+@max_regno = common global i32 0, align 4
+
+define void @test(i32* %old, i32 %final) {
+for.cond:
+ br label %for.cond2
+
+for.cond2: ; preds = %for.inc, %for.cond
+ %i.1 = phi i32 [ %inc19, %for.inc ], [ 0, %for.cond ]
+ %bit.0 = phi i32 [ %shl, %for.inc ], [ 1, %for.cond ]
+ %tobool = icmp eq i32 %bit.0, 0
+ %v3 = load i32* @max_regno, align 4
+ %cmp4 = icmp eq i32 %i.1, %v3
+ %or.cond = or i1 %tobool, %cmp4
+ br i1 %or.cond, label %for.inc20, label %for.inc, !prof !0
+; CHECK: BB#1: derived from LLVM BB %for.cond2
+; CHECK: Successors according to CFG: BB#3(56008718) BB#4(2203492365)
+; CHECK: BB#4: derived from LLVM BB %for.cond2
+; CHECK: Successors according to CFG: BB#3(112017436) BB#2(4294967294)
+
+for.inc: ; preds = %for.cond2
+ %shl = shl i32 %bit.0, 1
+ %inc19 = add nsw i32 %i.1, 1
+ br label %for.cond2
+
+for.inc20: ; preds = %for.cond2
+ ret void
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 112017436, i32 -735157296}
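A note on the weights above: branch weights are read as unsigned 32-bit values, so the i32 -735157296 wraps around. The arithmetic behind the CHECK lines, as far as it can be read off the output:

  -735157296 mod 2^32 = 4294967296 - 735157296 = 3559810000   ; weight carried to the second branch
  112017436 / 2       = 56008718                               ; BB#1 -> BB#3: the split gives the first branch half the taken weight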
diff --git a/test/CodeGen/X86/MachineSink-DbgValue.ll b/test/CodeGen/X86/MachineSink-DbgValue.ll
index 584e644..4ce2fb3 100644
--- a/test/CodeGen/X86/MachineSink-DbgValue.ll
+++ b/test/CodeGen/X86/MachineSink-DbgValue.ll
@@ -13,8 +13,8 @@ define i32 @foo(i32 %i, i32* nocapture %c) nounwind uwtable readonly ssp {
bb1: ; preds = %0
;CHECK: DEBUG_VALUE: a
-;CHECK-NEXT: .loc 1 5 5
-;CHECK-NEXT: addl
+;CHECK: .loc 1 5 5
+;CHECK-NEXT: addl
%gh = add nsw i32 %ab, 2, !dbg !16
br label %bb2, !dbg !16
diff --git a/test/CodeGen/X86/alias-error.ll b/test/CodeGen/X86/alias-error.ll
deleted file mode 100644
index 8f01dcf..0000000
--- a/test/CodeGen/X86/alias-error.ll
+++ /dev/null
@@ -1,5 +0,0 @@
-; RUN: not llc -mtriple=i686-pc-linux-gnu %s -o /dev/null 2>&1 | FileCheck %s
-
-@a = external global i32
-@b = alias i32* @a
-; CHECK: b: Target doesn't support aliases to declarations
diff --git a/test/CodeGen/X86/anyregcc-crash.ll b/test/CodeGen/X86/anyregcc-crash.ll
index cf6f6ed..3abe3d1 100644
--- a/test/CodeGen/X86/anyregcc-crash.ll
+++ b/test/CodeGen/X86/anyregcc-crash.ll
@@ -7,11 +7,11 @@ define i64 @anyreglimit(i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
i64 %v13, i64 %v14, i64 %v15, i64 %v16) {
entry:
- %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 16,
i64 %v1, i64 %v2, i64 %v3, i64 %v4, i64 %v5, i64 %v6,
i64 %v7, i64 %v8, i64 %v9, i64 %v10, i64 %v11, i64 %v12,
i64 %v13, i64 %v14, i64 %v15, i64 %v16)
ret i64 %result
}
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
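The signature change here (an i64 ID instead of i32) recurs throughout the stackmap/patchpoint tests below. A minimal call under the new signature, in the same style as the tests (the ID value 42 is an arbitrary illustration):

declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)

define i64 @patchpoint_example() {
entry:
  ; operands: i64 ID, i32 patchable byte count, i8* target, i32 call-argument count, args...
  %r = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 42, i32 15, i8* null, i32 0)
  ret i64 %r
}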
diff --git a/test/CodeGen/X86/anyregcc.ll b/test/CodeGen/X86/anyregcc.ll
index 8109f87..98ba17c 100644
--- a/test/CodeGen/X86/anyregcc.ll
+++ b/test/CodeGen/X86/anyregcc.ll
@@ -1,17 +1,44 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck --check-prefix=SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck --check-prefix=AVX %s
+
; Stackmap Header: no constants - 6 callsites
-; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
-; CHECK-NEXT: __LLVM_StackMaps:
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
; Header
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 8
; Num Constants
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 0
; Num Callsites
-; CHECK-NEXT: .long 8
+; CHECK-NEXT: .long 8
+
+; Functions and stack size
+; CHECK-NEXT: .quad _test
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _property_access1
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _property_access2
+; CHECK-NEXT: .quad 24
+; CHECK-NEXT: .quad _property_access3
+; CHECK-NEXT: .quad 24
+; CHECK-NEXT: .quad _anyreg_test1
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _anyreg_test2
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _patchpoint_spilldef
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _patchpoint_spillargs
+; CHECK-NEXT: .quad 88
+; No constants
+
+; Callsites
; test
-; CHECK-NEXT: .long 0
; CHECK-LABEL: .long L{{.*}}-_test
; CHECK-NEXT: .short 0
; 3 locations
@@ -33,12 +60,11 @@
; CHECK-NEXT: .long 3
define i64 @test() nounwind ssp uwtable {
entry:
- call anyregcc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 0, i32 15, i8* null, i32 2, i32 1, i32 2, i64 3)
ret i64 0
}
; property access 1 - %obj is an anyreg call argument and should therefore be in a register
-; CHECK-NEXT: .long 1
; CHECK-LABEL: .long L{{.*}}-_property_access1
; CHECK-NEXT: .short 0
; 2 locations
@@ -56,12 +82,11 @@ entry:
define i64 @property_access1(i8* %obj) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 1, i32 15, i8* %f, i32 1, i8* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 1, i32 15, i8* %f, i32 1, i8* %obj)
ret i64 %ret
}
; property access 2 - %obj is an anyreg call argument and should therefore be in a register
-; CHECK-NEXT: .long 2
; CHECK-LABEL: .long L{{.*}}-_property_access2
; CHECK-NEXT: .short 0
; 2 locations
@@ -80,12 +105,11 @@ define i64 @property_access2() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %f, i32 1, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %f, i32 1, i64* %obj)
ret i64 %ret
}
; property access 3 - %obj is a frame index
-; CHECK-NEXT: .long 3
; CHECK-LABEL: .long L{{.*}}-_property_access3
; CHECK-NEXT: .short 0
; 2 locations
@@ -95,21 +119,20 @@ entry:
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short {{[0-9]+}}
; CHECK-NEXT: .long 0
-; Loc 1: Register <-- this will be folded once folding for FI is implemented
-; CHECK-NEXT: .byte 1
+; Loc 1: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
define i64 @property_access3() nounwind ssp uwtable {
entry:
%obj = alloca i64, align 8
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 3, i32 15, i8* %f, i32 0, i64* %obj)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 3, i32 15, i8* %f, i32 0, i64* %obj)
ret i64 %ret
}
; anyreg_test1
-; CHECK-NEXT: .long 4
; CHECK-LABEL: .long L{{.*}}-_anyreg_test1
; CHECK-NEXT: .short 0
; 14 locations
@@ -187,12 +210,11 @@ entry:
define i64 @anyreg_test1(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 4, i32 15, i8* %f, i32 13, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
; anyreg_test2
-; CHECK-NEXT: .long 5
; CHECK-LABEL: .long L{{.*}}-_anyreg_test2
; CHECK-NEXT: .short 0
; 14 locations
@@ -270,7 +292,7 @@ entry:
define i64 @anyreg_test2(i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13) nounwind ssp uwtable {
entry:
%f = inttoptr i64 12297829382473034410 to i8*
- %ret = call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
+ %ret = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %f, i32 8, i8* %a1, i8* %a2, i8* %a3, i8* %a4, i8* %a5, i8* %a6, i8* %a7, i8* %a8, i8* %a9, i8* %a10, i8* %a11, i8* %a12, i8* %a13)
ret i64 %ret
}
@@ -278,7 +300,6 @@ entry:
;
; <rdar://problem/15432754> [JS] Assertion: "Folded a def to a non-store!"
;
-; CHECK-LABEL: .long 12
; CHECK-LABEL: .long L{{.*}}-_patchpoint_spilldef
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 3
@@ -299,7 +320,7 @@ entry:
; CHECK-NEXT: .long 0
define i64 @patchpoint_spilldef(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
- %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 12, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2)
tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
ret i64 %result
}
@@ -308,7 +329,6 @@ entry:
;
; <rdar://problem/15487687> [JS] AnyRegCC argument ends up being spilled
;
-; CHECK-LABEL: .long 13
; CHECK-LABEL: .long L{{.*}}-_patchpoint_spillargs
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 5
@@ -330,19 +350,119 @@ entry:
; Loc 3: Arg2 spilled to RBP +
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 7
-; CHECK-NEXT: .long {{[0-9]+}}
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
; Loc 4: Arg3 spilled to RBP +
; CHECK-NEXT: .byte 3
; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 7
-; CHECK-NEXT: .long {{[0-9]+}}
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
define i64 @patchpoint_spillargs(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
- %result = tail call anyregcc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 13, i32 15, i8* inttoptr (i64 0 to i8*), i32 2, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
ret i64 %result
}
-declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+; Make sure all regs are spilled
+define anyregcc void @anyregcc1() {
+entry:
+;SSE-LABEL: anyregcc1
+;SSE: pushq %rbp
+;SSE: pushq %rax
+;SSE: pushq %r15
+;SSE: pushq %r14
+;SSE: pushq %r13
+;SSE: pushq %r12
+;SSE: pushq %r11
+;SSE: pushq %r10
+;SSE: pushq %r9
+;SSE: pushq %r8
+;SSE: pushq %rdi
+;SSE: pushq %rsi
+;SSE: pushq %rdx
+;SSE: pushq %rcx
+;SSE: pushq %rbx
+;SSE: movaps %xmm15
+;SSE-NEXT: movaps %xmm14
+;SSE-NEXT: movaps %xmm13
+;SSE-NEXT: movaps %xmm12
+;SSE-NEXT: movaps %xmm11
+;SSE-NEXT: movaps %xmm10
+;SSE-NEXT: movaps %xmm9
+;SSE-NEXT: movaps %xmm8
+;SSE-NEXT: movaps %xmm7
+;SSE-NEXT: movaps %xmm6
+;SSE-NEXT: movaps %xmm5
+;SSE-NEXT: movaps %xmm4
+;SSE-NEXT: movaps %xmm3
+;SSE-NEXT: movaps %xmm2
+;SSE-NEXT: movaps %xmm1
+;SSE-NEXT: movaps %xmm0
+;AVX-LABEL:anyregcc1
+;AVX: pushq %rbp
+;AVX: pushq %rax
+;AVX: pushq %r15
+;AVX: pushq %r14
+;AVX: pushq %r13
+;AVX: pushq %r12
+;AVX: pushq %r11
+;AVX: pushq %r10
+;AVX: pushq %r9
+;AVX: pushq %r8
+;AVX: pushq %rdi
+;AVX: pushq %rsi
+;AVX: pushq %rdx
+;AVX: pushq %rcx
+;AVX: pushq %rbx
+;AVX: vmovaps %ymm15
+;AVX-NEXT: vmovaps %ymm14
+;AVX-NEXT: vmovaps %ymm13
+;AVX-NEXT: vmovaps %ymm12
+;AVX-NEXT: vmovaps %ymm11
+;AVX-NEXT: vmovaps %ymm10
+;AVX-NEXT: vmovaps %ymm9
+;AVX-NEXT: vmovaps %ymm8
+;AVX-NEXT: vmovaps %ymm7
+;AVX-NEXT: vmovaps %ymm6
+;AVX-NEXT: vmovaps %ymm5
+;AVX-NEXT: vmovaps %ymm4
+;AVX-NEXT: vmovaps %ymm3
+;AVX-NEXT: vmovaps %ymm2
+;AVX-NEXT: vmovaps %ymm1
+;AVX-NEXT: vmovaps %ymm0
+ call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"()
+ ret void
+}
+
+; Make sure we don't spill any XMMs/YMMs
+declare anyregcc void @foo()
+define void @anyregcc2() {
+entry:
+;SSE-LABEL: anyregcc2
+;SSE-NOT: movaps %xmm
+;AVX-LABEL: anyregcc2
+;AVX-NOT: vmovups %ymm
+ %a0 = call <2 x double> asm sideeffect "", "={xmm0}"() nounwind
+ %a1 = call <2 x double> asm sideeffect "", "={xmm1}"() nounwind
+ %a2 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+ %a3 = call <2 x double> asm sideeffect "", "={xmm3}"() nounwind
+ %a4 = call <2 x double> asm sideeffect "", "={xmm4}"() nounwind
+ %a5 = call <2 x double> asm sideeffect "", "={xmm5}"() nounwind
+ %a6 = call <2 x double> asm sideeffect "", "={xmm6}"() nounwind
+ %a7 = call <2 x double> asm sideeffect "", "={xmm7}"() nounwind
+ %a8 = call <2 x double> asm sideeffect "", "={xmm8}"() nounwind
+ %a9 = call <2 x double> asm sideeffect "", "={xmm9}"() nounwind
+ %a10 = call <2 x double> asm sideeffect "", "={xmm10}"() nounwind
+ %a11 = call <2 x double> asm sideeffect "", "={xmm11}"() nounwind
+ %a12 = call <2 x double> asm sideeffect "", "={xmm12}"() nounwind
+ %a13 = call <2 x double> asm sideeffect "", "={xmm13}"() nounwind
+ %a14 = call <2 x double> asm sideeffect "", "={xmm14}"() nounwind
+ %a15 = call <2 x double> asm sideeffect "", "={xmm15}"() nounwind
+ call anyregcc void @foo()
+ call void asm sideeffect "", "{xmm0},{xmm1},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},{xmm8},{xmm9},{xmm10},{xmm11},{xmm12},{xmm13},{xmm14},{xmm15}"(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2, <2 x double> %a3, <2 x double> %a4, <2 x double> %a5, <2 x double> %a6, <2 x double> %a7, <2 x double> %a8, <2 x double> %a9, <2 x double> %a10, <2 x double> %a11, <2 x double> %a12, <2 x double> %a13, <2 x double> %a14, <2 x double> %a15)
+ ret void
+}
+
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
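The CHECK lines near the top of this file spell out the v1 stackmap header that the new directives verify. Laid out as emitted (field names are descriptive glosses, not taken from the patch):

  .byte  1               ; stackmap format version
  .byte  0               ; reserved
  .short 0               ; reserved
  .long  NumFunctions    ; 8 in this file
  .long  NumConstants    ; 0
  .long  NumRecords      ; 8 callsites
  ; then one (.quad address, .quad stack size) pair per function,
  ; the constant pool, and finally the per-callsite records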
diff --git a/test/CodeGen/X86/asm-block-labels.ll b/test/CodeGen/X86/asm-block-labels.ll
index a43d430..6dbfb16 100644
--- a/test/CodeGen/X86/asm-block-labels.ll
+++ b/test/CodeGen/X86/asm-block-labels.ll
@@ -1,4 +1,4 @@
-; RUN: opt < %s -std-compile-opts | llc
+; RUN: opt < %s -std-compile-opts | llc -no-integrated-as
; ModuleID = 'block12.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128"
target triple = "i686-apple-darwin8"
diff --git a/test/CodeGen/X86/asm-global-imm.ll b/test/CodeGen/X86/asm-global-imm.ll
index ebf585a..9e79f6f 100644
--- a/test/CodeGen/X86/asm-global-imm.ll
+++ b/test/CodeGen/X86/asm-global-imm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -relocation-model=static | FileCheck %s
+; RUN: llc < %s -march=x86 -relocation-model=static -no-integrated-as | FileCheck %s
; PR882
target datalayout = "e-p:32:32"
diff --git a/test/CodeGen/X86/atom-cmpb.ll b/test/CodeGen/X86/atom-cmpb.ll
new file mode 100644
index 0000000..034bf2f
--- /dev/null
+++ b/test/CodeGen/X86/atom-cmpb.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -march=x86 -mcpu=atom | FileCheck %s
+; CHECK: movl
+; CHECK: movb
+; CHECK: movb
+; CHECK: cmpb
+; CHECK: notb
+; CHECK: notb
+
+; Test that the conversion to cmp32 is cancelled for Atom in
+; function 'X86TargetLowering::EmitCmp'.
+
+define i8 @run_test(i8* %rd_p) {
+entry:
+ %incdec.ptr = getelementptr inbounds i8* %rd_p, i64 1
+ %ld1 = load i8* %rd_p, align 1
+ %incdec.ptr1 = getelementptr inbounds i8* %rd_p, i64 2
+ %ld2 = load i8* %incdec.ptr, align 1
+ %x4 = xor i8 %ld1, -1
+ %x5 = xor i8 %ld2, -1
+ %cmp34 = icmp ult i8 %ld2, %ld1
+ br i1 %cmp34, label %if.then3, label %if.else
+
+if.then3:
+ %sub7 = sub i8 %x4, %x5
+ br label %if.end4
+
+if.else:
+ %sub8 = sub i8 %x5, %x4
+ br label %if.end4
+
+if.end4:
+ %res = phi i8 [ %sub7, %if.then3 ], [ %sub8, %if.else ]
+ ret i8 %res
+
+}
+
diff --git a/test/CodeGen/X86/atomic16.ll b/test/CodeGen/X86/atomic16.ll
index ec2887e..45d3ff4 100644
--- a/test/CodeGen/X86/atomic16.ll
+++ b/test/CodeGen/X86/atomic16.ll
@@ -217,7 +217,7 @@ define void @atomic_fetch_umin16(i16 %x) nounwind {
}
define void @atomic_fetch_cmpxchg16() nounwind {
- %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire
+ %t1 = cmpxchg i16* @sc16, i16 0, i16 1 acquire acquire
; X64: lock
; X64: cmpxchgw
; X32: lock
diff --git a/test/CodeGen/X86/atomic32.ll b/test/CodeGen/X86/atomic32.ll
index 3cb9ca1..474c0e6 100644
--- a/test/CodeGen/X86/atomic32.ll
+++ b/test/CodeGen/X86/atomic32.ll
@@ -243,7 +243,7 @@ define void @atomic_fetch_umin32(i32 %x) nounwind {
}
define void @atomic_fetch_cmpxchg32() nounwind {
- %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire
+ %t1 = cmpxchg i32* @sc32, i32 0, i32 1 acquire acquire
; X64: lock
; X64: cmpxchgl
; X32: lock
diff --git a/test/CodeGen/X86/atomic64.ll b/test/CodeGen/X86/atomic64.ll
index aa00045..4f55edc 100644
--- a/test/CodeGen/X86/atomic64.ll
+++ b/test/CodeGen/X86/atomic64.ll
@@ -183,7 +183,7 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
}
define void @atomic_fetch_cmpxchg64() nounwind {
- %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire
; X64: lock
; X64: cmpxchgq
; X32: lock
diff --git a/test/CodeGen/X86/atomic6432.ll b/test/CodeGen/X86/atomic6432.ll
index 31e66c8..c0f7267 100644
--- a/test/CodeGen/X86/atomic6432.ll
+++ b/test/CodeGen/X86/atomic6432.ll
@@ -184,7 +184,7 @@ define void @atomic_fetch_umin64(i64 %x) nounwind {
}
define void @atomic_fetch_cmpxchg64() nounwind {
- %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire
+ %t1 = cmpxchg i64* @sc64, i64 0, i64 1 acquire acquire
; X32: lock
; X32: cmpxchg8b
ret void
diff --git a/test/CodeGen/X86/atomic8.ll b/test/CodeGen/X86/atomic8.ll
index 3278ed1..203b26f 100644
--- a/test/CodeGen/X86/atomic8.ll
+++ b/test/CodeGen/X86/atomic8.ll
@@ -217,7 +217,7 @@ define void @atomic_fetch_umin8(i8 %x) nounwind {
}
define void @atomic_fetch_cmpxchg8() nounwind {
- %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire
+ %t1 = cmpxchg i8* @sc8, i8 0, i8 1 acquire acquire
; X64: lock
; X64: cmpxchgb
; X32: lock
diff --git a/test/CodeGen/X86/atomic_op.ll b/test/CodeGen/X86/atomic_op.ll
index a378d6e..b3045ed 100644
--- a/test/CodeGen/X86/atomic_op.ll
+++ b/test/CodeGen/X86/atomic_op.ll
@@ -101,11 +101,11 @@ entry:
%neg1 = sub i32 0, 10 ; <i32> [#uses=1]
; CHECK: lock
; CHECK: cmpxchgl
- %16 = cmpxchg i32* %val2, i32 %neg1, i32 1 monotonic
+ %16 = cmpxchg i32* %val2, i32 %neg1, i32 1 monotonic monotonic
store i32 %16, i32* %old
; CHECK: lock
; CHECK: cmpxchgl
- %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic
+ %17 = cmpxchg i32* %val2, i32 1976, i32 1 monotonic monotonic
store i32 %17, i32* %old
; CHECK: movl [[R17atomic:.*]], %eax
; CHECK: movl $1401, %[[R17mask:[a-z]*]]
@@ -133,6 +133,6 @@ entry:
; CHECK: lock
; CHECK: cmpxchgl %{{.*}}, %gs:(%{{.*}})
- %0 = cmpxchg i32 addrspace(256)* %P, i32 0, i32 1 monotonic
+ %0 = cmpxchg i32 addrspace(256)* %P, i32 0, i32 1 monotonic monotonic
ret void
}
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index a98e076..5fcd5ff 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -6,7 +6,7 @@
;CHECK: vblendvps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
ret <4 x float> %vsel
}
@@ -15,13 +15,13 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK: vblendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
;CHECK-LABEL: vsel_double:
-;CHECK: vblendvpd
+;CHECK: vmovsd
;CHECK: ret
define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x double> %v1, <2 x double> %v2
@@ -30,7 +30,7 @@ define <2 x double> @vsel_double(<2 x double> %v1, <2 x double> %v2) {
;CHECK-LABEL: vsel_i64:
-;CHECK: vblendvpd
+;CHECK: vmovsd
;CHECK: ret
define <2 x i64> @vsel_i64(<2 x i64> %v1, <2 x i64> %v2) {
%vsel = select <2 x i1> <i1 true, i1 false>, <2 x i64> %v1, <2 x i64> %v2
@@ -51,6 +51,7 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
;CHECK-LABEL: vsel_float8:
+;CHECK-NOT: vinsertf128
;CHECK: vblendvps
;CHECK: ret
define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
@@ -59,8 +60,9 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
}
;CHECK-LABEL: vsel_i328:
+;CHECK-NOT: vinsertf128
;CHECK: vblendvps
-;CHECK: ret
+;CHECK-NEXT: ret
define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
ret <8 x i32> %vsel
@@ -82,6 +84,15 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
ret <8 x i64> %vsel
}
+;CHECK-LABEL: vsel_double4:
+;CHECK-NOT: vinsertf128
+;CHECK: vblendvpd
+;CHECK-NEXT: ret
+define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
+ ret <4 x double> %vsel
+}
+
;; TEST blend + compares
; CHECK: testa
define <2 x double> @testa(<2 x double> %x, <2 x double> %y) {
diff --git a/test/CodeGen/X86/avx-cvt-2.ll b/test/CodeGen/X86/avx-cvt-2.ll
new file mode 100644
index 0000000..8cc7190
--- /dev/null
+++ b/test/CodeGen/X86/avx-cvt-2.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+
+; Check that we generate vector conversions from float to narrower int types
+
+%f32vec_t = type <8 x float>
+%i16vec_t = type <8 x i16>
+%i8vec_t = type <8 x i8>
+
+define void @fptoui16(%f32vec_t %a, %i16vec_t *%p) {
+; CHECK-LABEL: fptoui16:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptoui %f32vec_t %a to %i16vec_t
+ store %i16vec_t %b, %i16vec_t * %p
+ ret void
+}
+
+define void @fptosi16(%f32vec_t %a, %i16vec_t *%p) {
+; CHECK-LABEL: fptosi16:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptosi %f32vec_t %a to %i16vec_t
+ store %i16vec_t %b, %i16vec_t * %p
+ ret void
+}
+
+define void @fptoui8(%f32vec_t %a, %i8vec_t *%p) {
+; CHECK-LABEL: fptoui8:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptoui %f32vec_t %a to %i8vec_t
+ store %i8vec_t %b, %i8vec_t * %p
+ ret void
+}
+
+define void @fptosi8(%f32vec_t %a, %i8vec_t *%p) {
+; CHECK-LABEL: fptosi8:
+; CHECK: vcvttps2dq %ymm
+; CHECK-NOT: vcvttss2si
+ %b = fptosi %f32vec_t %a to %i8vec_t
+ store %i8vec_t %b, %i8vec_t * %p
+ ret void
+}
diff --git a/test/CodeGen/X86/avx-shift.ll b/test/CodeGen/X86/avx-shift.ll
index d79dfcc..a70d45a 100644
--- a/test/CodeGen/X86/avx-shift.ll
+++ b/test/CodeGen/X86/avx-shift.ll
@@ -115,8 +115,8 @@ define <8 x i32> @vshift08(<8 x i32> %a) nounwind {
; PR15141
; CHECK: _vshift13:
; CHECK-NOT: vpsll
-; CHECK: vcvttps2dq
-; CHECK-NEXT: vpmulld
+; CHECK-NOT: vcvttps2dq
+; CHECK: vpmulld
define <4 x i32> @vshift13(<4 x i32> %in) {
%T = shl <4 x i32> %in, <i32 0, i32 1, i32 2, i32 4>
ret <4 x i32> %T
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 0956361..02aa617 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -297,3 +297,12 @@ entry:
}
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
declare <4 x double> @llvm.x86.avx.vinsertf128.pd.256(<4 x double>, <2 x double>, i8) nounwind readnone
+
+; This test case should simply compile without failing.
+define void @test20() {
+ %a0 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double 0.000000e+00, i32 2
+ store <3 x double> %a0, <3 x double>* undef, align 1
+ %a1 = insertelement <3 x double> <double 0.000000e+00, double 0.000000e+00, double undef>, double undef, i32 2
+ store <3 x double> %a1, <3 x double>* undef, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx-trunc.ll b/test/CodeGen/X86/avx-trunc.ll
index 58d0a35..bf8d9a7 100644
--- a/test/CodeGen/X86/avx-trunc.ll
+++ b/test/CodeGen/X86/avx-trunc.ll
@@ -1,13 +1,15 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
define <4 x i32> @trunc_64_32(<4 x i64> %A) nounwind uwtable readnone ssp{
-; CHECK: trunc_64_32
-; CHECK: pshufd
+; CHECK-LABEL: trunc_64_32
+; CHECK: shufps
+; CHECK-NOT: pshufd
+; CHECK-NOT: movlhps
%B = trunc <4 x i64> %A to <4 x i32>
ret <4 x i32>%B
}
define <8 x i16> @trunc_32_16(<8 x i32> %A) nounwind uwtable readnone ssp{
-; CHECK: trunc_32_16
+; CHECK-LABEL: trunc_32_16
; CHECK: pshufb
%B = trunc <8 x i32> %A to <8 x i16>
ret <8 x i16>%B
diff --git a/test/CodeGen/X86/avx-vbroadcast.ll b/test/CodeGen/X86/avx-vbroadcast.ll
index 0d403d4..2ebe6fd 100644
--- a/test/CodeGen/X86/avx-vbroadcast.ll
+++ b/test/CodeGen/X86/avx-vbroadcast.ll
@@ -141,3 +141,66 @@ entry:
ret <4 x float> %t
}
+
+; These tests check that a vbroadcast instruction is used when we have a splat
+; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
+; (via the insertelements).
+
+; CHECK-LABEL: splat_concat1
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss (%
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat1(float* %p) {
+ %1 = load float* %p, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = insertelement <4 x float> %2, float %1, i32 1
+ %4 = insertelement <4 x float> %3, float %1, i32 2
+ %5 = insertelement <4 x float> %4, float %1, i32 3
+ %6 = shufflevector <4 x float> %5, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %6
+}
+
+; CHECK-LABEL: splat_concat2
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss (%
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat2(float* %p) {
+ %1 = load float* %p, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %3 = insertelement <4 x float> %2, float %1, i32 1
+ %4 = insertelement <4 x float> %3, float %1, i32 2
+ %5 = insertelement <4 x float> %4, float %1, i32 3
+ %6 = insertelement <4 x float> undef, float %1, i32 0
+ %7 = insertelement <4 x float> %6, float %1, i32 1
+ %8 = insertelement <4 x float> %7, float %1, i32 2
+ %9 = insertelement <4 x float> %8, float %1, i32 3
+ %10 = shufflevector <4 x float> %5, <4 x float> %9, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %10
+}
+
+; CHECK-LABEL: splat_concat3
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd (%
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat3(double* %p) {
+ %1 = load double* %p, align 8
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = insertelement <2 x double> %2, double %1, i32 1
+ %4 = shufflevector <2 x double> %3, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %4
+}
+
+; CHECK-LABEL: splat_concat4
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd (%
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat4(double* %p) {
+ %1 = load double* %p, align 8
+ %2 = insertelement <2 x double> undef, double %1, i32 0
+ %3 = insertelement <2 x double> %2, double %1, i32 1
+ %4 = insertelement <2 x double> undef, double %1, i32 0
+ %5 = insertelement <2 x double> %4, double %1, i32 1
+ %6 = shufflevector <2 x double> %3, <2 x double> %5, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %6
+}
+
diff --git a/test/CodeGen/X86/avx-vzeroupper.ll b/test/CodeGen/X86/avx-vzeroupper.ll
index bf4ab5b..a2163a2 100644
--- a/test/CodeGen/X86/avx-vzeroupper.ll
+++ b/test/CodeGen/X86/avx-vzeroupper.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -x86-use-vzeroupper -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+declare i32 @foo()
declare <4 x float> @do_sse(<4 x float>)
declare <8 x float> @do_avx(<8 x float>)
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
@@ -36,20 +37,38 @@ entry:
ret <8 x float> %c
}
+;; Check that vzeroupper is emitted for tail calls.
+
+; CHECK: _test02
+define <4 x float> @test02(<8 x float> %a, <8 x float> %b) nounwind uwtable ssp {
+entry:
+ %add.i = fadd <8 x float> %a, %b
+ %add.low = call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %add.i, i8 0)
+ ; CHECK: vzeroupper
+ ; CHECK: jmp _do_sse
+ %call3 = tail call <4 x float> @do_sse(<4 x float> %add.low) nounwind
+ ret <4 x float> %call3
+}
+
;; Test the pass convergence and also that vzeroupper is only issued when necessary,
;; for this function it should be only once
-; CHECK: _test02
-define <4 x float> @test02(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+; CHECK: _test03
+define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
%add.i = fadd <4 x float> %a, %b
- br label %for.body
+ br label %while.cond
-for.body: ; preds = %for.body, %entry
+while.cond:
+ %call = tail call i32 @foo()
+ %tobool = icmp eq i32 %call, 0
+ br i1 %tobool, label %for.body, label %while.cond
+
+for.body:
; CHECK: LBB
; CHECK-NOT: vzeroupper
- %i.018 = phi i32 [ 0, %entry ], [ %1, %for.body ]
- %c.017 = phi <4 x float> [ %add.i, %entry ], [ %call14, %for.body ]
+ %i.018 = phi i32 [ 0, %while.cond ], [ %1, %for.body ]
+ %c.017 = phi <4 x float> [ %add.i, %while.cond ], [ %call14, %for.body ]
; CHECK: callq _do_sse
%call5 = tail call <4 x float> @do_sse(<4 x float> %c.017) nounwind
; CHECK-NEXT: callq _do_sse
@@ -63,14 +82,14 @@ for.body: ; preds = %for.body, %entry
%exitcond = icmp eq i32 %1, 4
br i1 %exitcond, label %for.end, label %for.body
-for.end: ; preds = %for.body
+for.end:
ret <4 x float> %call14
}
;; Check that we also perform vzeroupper when we return from a function.
-; CHECK: _test03
-define <4 x float> @test03(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
+; CHECK: _test04
+define <4 x float> @test04(<4 x float> %a, <4 x float> %b) nounwind uwtable ssp {
entry:
%shuf = shufflevector <4 x float> %a, <4 x float> %b, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
; CHECK-NOT: vzeroupper
diff --git a/test/CodeGen/X86/avx2-gather.ll b/test/CodeGen/X86/avx2-gather.ll
index ee50c45..a9ac025 100644
--- a/test/CodeGen/X86/avx2-gather.ll
+++ b/test/CodeGen/X86/avx2-gather.ll
@@ -15,4 +15,20 @@ define <4 x float> @test_x86_avx2_gather_d_ps(i8* %a1,
; CHECK: vgatherdps
; CHECK-NOT: [[DST]]
; CHECK: [[DST:%xmm[0-9]+]]{{$}}
+; CHECK: vmovaps
+; CHECK: ret
+
+declare <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double>, i8*,
+ <4 x i32>, <2 x double>, i8) nounwind readonly
+
+define <2 x double> @test_x86_avx2_gather_d_pd(i8* %a1,
+ <4 x i32> %idx, <2 x double> %mask) {
+ %res = call <2 x double> @llvm.x86.avx2.gather.d.pd(<2 x double> undef,
+ i8* %a1, <4 x i32> %idx, <2 x double> %mask, i8 2) ;
+ ret <2 x double> %res
+}
+
+; CHECK: test_x86_avx2_gather_d_pd
+; CHECK: vgatherdpd
+; CHECK: vmovapd
; CHECK: ret
diff --git a/test/CodeGen/X86/avx2-intrinsics-x86.ll b/test/CodeGen/X86/avx2-intrinsics-x86.ll
index a6141b0..ab3d591 100644
--- a/test/CodeGen/X86/avx2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/avx2-intrinsics-x86.ll
@@ -753,7 +753,7 @@ declare <16 x i16> @llvm.x86.avx2.pbroadcastw.256(<8 x i16>) nounwind readonly
define <4 x i32> @test_x86_avx2_pbroadcastd_128(<4 x i32> %a0) {
- ; CHECK: vpbroadcastd
+ ; CHECK: vbroadcastss
%res = call <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32> %a0) ; <<4 x i32>> [#uses=1]
ret <4 x i32> %res
}
@@ -761,7 +761,7 @@ declare <4 x i32> @llvm.x86.avx2.pbroadcastd.128(<4 x i32>) nounwind readonly
define <8 x i32> @test_x86_avx2_pbroadcastd_256(<4 x i32> %a0) {
- ; CHECK: vpbroadcastd
+ ; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
%res = call <8 x i32> @llvm.x86.avx2.pbroadcastd.256(<4 x i32> %a0) ; <<8 x i32>> [#uses=1]
ret <8 x i32> %res
}
@@ -777,7 +777,7 @@ declare <2 x i64> @llvm.x86.avx2.pbroadcastq.128(<2 x i64>) nounwind readonly
define <4 x i64> @test_x86_avx2_pbroadcastq_256(<2 x i64> %a0) {
- ; CHECK: vpbroadcastq
+ ; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
%res = call <4 x i64> @llvm.x86.avx2.pbroadcastq.256(<2 x i64> %a0) ; <<4 x i64>> [#uses=1]
ret <4 x i64> %res
}
@@ -1142,7 +1142,7 @@ define <8 x float> @test_gather_mask(<8 x float> %a0, float* %a,
<8 x i32> %idx, <8 x float> %mask,
float* nocapture %out) {
; CHECK: test_gather_mask
-; CHECK: vmovdqa %ymm2, [[DEST:%.*]]
+; CHECK: vmovaps %ymm2, [[DEST:%.*]]
; CHECK: vgatherdps [[DEST]]
;; gather with mask
%a_i8 = bitcast float* %a to i8*
diff --git a/test/CodeGen/X86/avx2-shift.ll b/test/CodeGen/X86/avx2-shift.ll
index 7fdbaaa..025d52e 100644
--- a/test/CodeGen/X86/avx2-shift.ll
+++ b/test/CodeGen/X86/avx2-shift.ll
@@ -266,3 +266,36 @@ define <8 x i32> @sext_v8i32(<8 x i32> %a) nounwind {
%c = sext <8 x i16> %b to <8 x i32>
ret <8 x i32> %c
}
+
+define <8 x i16> @variable_shl16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: variable_shl16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsllvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+ %res = shl <8 x i16> %lhs, %rhs
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @variable_ashr16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: variable_ashr16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovsxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsravd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+ %res = ashr <8 x i16> %lhs, %rhs
+ ret <8 x i16> %res
+}
+
+define <8 x i16> @variable_lshr16(<8 x i16> %lhs, <8 x i16> %rhs) {
+; CHECK-LABEL: variable_lshr16:
+; CHECK-DAG: vpmovzxwd %xmm1, [[AMT:%ymm[0-9]+]]
+; CHECK-DAG: vpmovzxwd %xmm0, [[LHS:%ymm[0-9]+]]
+; CHECK: vpsrlvd [[AMT]], [[LHS]], {{%ymm[0-9]+}}
+; CHECK: vpshufb
+; CHECK: vpermq
+ %res = lshr <8 x i16> %lhs, %rhs
+ ret <8 x i16> %res
+}
\ No newline at end of file
diff --git a/test/CodeGen/X86/avx2-vbroadcast.ll b/test/CodeGen/X86/avx2-vbroadcast.ll
index 5610416..66f586d 100644
--- a/test/CodeGen/X86/avx2-vbroadcast.ll
+++ b/test/CodeGen/X86/avx2-vbroadcast.ll
@@ -98,7 +98,7 @@ entry:
%qf = insertelement <16 x i16> %qe, i16 %q, i32 15
ret <16 x i16> %qf
}
-; CHECK: vpbroadcastd (%
+; CHECK: vbroadcastss (%
define <4 x i32> @D32(i32* %ptr) nounwind uwtable readnone ssp {
entry:
%q = load i32* %ptr, align 4
@@ -108,7 +108,7 @@ entry:
%q3 = insertelement <4 x i32> %q2, i32 %q, i32 3
ret <4 x i32> %q3
}
-; CHECK: vpbroadcastd (%
+; CHECK: vbroadcastss (%
define <8 x i32> @DD32(i32* %ptr) nounwind uwtable readnone ssp {
entry:
%q = load i32* %ptr, align 4
@@ -130,7 +130,7 @@ entry:
%q1 = insertelement <2 x i64> %q0, i64 %q, i32 1
ret <2 x i64> %q1
}
-; CHECK: vpbroadcastq (%
+; CHECK: vbroadcastsd (%
define <4 x i64> @QQ64(i64* %ptr) nounwind uwtable readnone ssp {
entry:
%q = load i64* %ptr, align 4
@@ -293,7 +293,7 @@ define <8 x i16> @_inreg8xi16(<8 x i16> %a) {
;CHECK-LABEL: _inreg4xi64:
-;CHECK: vpbroadcastq
+;CHECK: vbroadcastsd
;CHECK: ret
define <4 x i64> @_inreg4xi64(<4 x i64> %a) {
%b = shufflevector <4 x i64> %a, <4 x i64> undef, <4 x i32> zeroinitializer
@@ -325,7 +325,7 @@ define <2 x double> @_inreg2xdouble(<2 x double> %a) {
}
;CHECK-LABEL: _inreg8xi32:
-;CHECK: vpbroadcastd
+;CHECK: vbroadcastss
;CHECK: ret
define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
%b = shufflevector <8 x i32> %a, <8 x i32> undef, <8 x i32> zeroinitializer
@@ -333,7 +333,7 @@ define <8 x i32> @_inreg8xi32(<8 x i32> %a) {
}
;CHECK-LABEL: _inreg4xi32:
-;CHECK: vpbroadcastd
+;CHECK: vbroadcastss
;CHECK: ret
define <4 x i32> @_inreg4xi32(<4 x i32> %a) {
%b = shufflevector <4 x i32> %a, <4 x i32> undef, <4 x i32> zeroinitializer
@@ -355,3 +355,219 @@ define <16 x i8> @_inreg16xi8(<16 x i8> %a) {
%b = shufflevector <16 x i8> %a, <16 x i8> undef, <16 x i32> zeroinitializer
ret <16 x i8> %b
}
+
+; These tests check that a vbroadcast instruction is used when we have a splat
+; formed from a concat_vectors (via the shufflevector) of two BUILD_VECTORs
+; (via the insertelements).
+
+; CHECK-LABEL: splat_concat1
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat1(float %f) {
+ %1 = insertelement <4 x float> undef, float %f, i32 0
+ %2 = insertelement <4 x float> %1, float %f, i32 1
+ %3 = insertelement <4 x float> %2, float %f, i32 2
+ %4 = insertelement <4 x float> %3, float %f, i32 3
+ %5 = shufflevector <4 x float> %4, <4 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
+ ret <8 x float> %5
+}
+
+; CHECK-LABEL: splat_concat2
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastss
+; CHECK-NEXT: ret
+define <8 x float> @splat_concat2(float %f) {
+ %1 = insertelement <4 x float> undef, float %f, i32 0
+ %2 = insertelement <4 x float> %1, float %f, i32 1
+ %3 = insertelement <4 x float> %2, float %f, i32 2
+ %4 = insertelement <4 x float> %3, float %f, i32 3
+ %5 = insertelement <4 x float> undef, float %f, i32 0
+ %6 = insertelement <4 x float> %5, float %f, i32 1
+ %7 = insertelement <4 x float> %6, float %f, i32 2
+ %8 = insertelement <4 x float> %7, float %f, i32 3
+ %9 = shufflevector <4 x float> %4, <4 x float> %8, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ ret <8 x float> %9
+}
+
+; CHECK-LABEL: splat_concat3
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat3(double %d) {
+ %1 = insertelement <2 x double> undef, double %d, i32 0
+ %2 = insertelement <2 x double> %1, double %d, i32 1
+ %3 = shufflevector <2 x double> %2, <2 x double> undef, <4 x i32> <i32 0, i32 1, i32 0, i32 1>
+ ret <4 x double> %3
+}
+
+; CHECK-LABEL: splat_concat4
+; CHECK-NOT: vinsertf128
+; CHECK: vbroadcastsd
+; CHECK-NEXT: ret
+define <4 x double> @splat_concat4(double %d) {
+ %1 = insertelement <2 x double> undef, double %d, i32 0
+ %2 = insertelement <2 x double> %1, double %d, i32 1
+ %3 = insertelement <2 x double> undef, double %d, i32 0
+ %4 = insertelement <2 x double> %3, double %d, i32 1
+ %5 = shufflevector <2 x double> %2, <2 x double> %4, <4 x i32> <i32 0, i32 1, i32 2, i32 3>
+ ret <4 x double> %5
+}
+
+; Test cases for <rdar://problem/16074331>.
+; Instruction selection for the broadcast instruction fails if
+; the load cannot be folded into the broadcast.
+; This happens if the load initially has one use but other uses are
+; created later, or if the selection DAG cannot prove that folding the
+; load will not create a cycle in the DAG.
+; These test cases exercise the latter.
+
+; CHECK-LABEL: isel_crash_16b
+; CHECK: vpbroadcastb {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16b(i8* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i8* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <16 x i8> undef, i8 %tmp2, i32 0
+ %splat.splat = shufflevector <16 x i8> %splat.splatinsert, <16 x i8> undef, <16 x i32> zeroinitializer
+ %tmp3 = bitcast <16 x i8> %splat.splat to <2 x i64>
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_32b
+; CHECK: vpbroadcastb {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_32b(i8* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i8* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <32 x i8> undef, i8 %tmp2, i32 0
+ %splat.splat = shufflevector <32 x i8> %splat.splatinsert, <32 x i8> undef, <32 x i32> zeroinitializer
+ %tmp3 = bitcast <32 x i8> %splat.splat to <4 x i64>
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_8w
+; CHECK: vpbroadcastw {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8w(i16* %cV_R.addr) {
+entry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i16* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <8 x i16> undef, i16 %tmp2, i32 0
+ %splat.splat = shufflevector <8 x i16> %splat.splatinsert, <8 x i16> undef, <8 x i32> zeroinitializer
+ %tmp3 = bitcast <8 x i16> %splat.splat to <2 x i64>
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_16w
+; CHECK: vpbroadcastw {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_16w(i16* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i16* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <16 x i16> undef, i16 %tmp2, i32 0
+ %splat.splat = shufflevector <16 x i16> %splat.splatinsert, <16 x i16> undef, <16 x i32> zeroinitializer
+ %tmp3 = bitcast <16 x i16> %splat.splat to <4 x i64>
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_4d
+; CHECK: vbroadcastss {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4d(i32* %cV_R.addr) {
+entry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i32* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <4 x i32> undef, i32 %tmp2, i32 0
+ %splat.splat = shufflevector <4 x i32> %splat.splatinsert, <4 x i32> undef, <4 x i32> zeroinitializer
+ %tmp3 = bitcast <4 x i32> %splat.splat to <2 x i64>
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %tmp3, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_8d
+; CHECK: vbroadcastss {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_8d(i32* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i32* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <8 x i32> undef, i32 %tmp2, i32 0
+ %splat.splat = shufflevector <8 x i32> %splat.splatinsert, <8 x i32> undef, <8 x i32> zeroinitializer
+ %tmp3 = bitcast <8 x i32> %splat.splat to <4 x i64>
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %tmp3, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_2q
+; CHECK: vpbroadcastq {{[^,]+}}, %xmm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_2q(i64* %cV_R.addr) {
+entry:
+ %__a.addr.i = alloca <2 x i64>, align 16
+ %__b.addr.i = alloca <2 x i64>, align 16
+ %vCr = alloca <2 x i64>, align 16
+ store <2 x i64> zeroinitializer, <2 x i64>* %vCr, align 16
+ %tmp = load <2 x i64>* %vCr, align 16
+ %tmp2 = load i64* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <2 x i64> undef, i64 %tmp2, i32 0
+ %splat.splat = shufflevector <2 x i64> %splat.splatinsert, <2 x i64> undef, <2 x i32> zeroinitializer
+ store <2 x i64> %tmp, <2 x i64>* %__a.addr.i, align 16
+ store <2 x i64> %splat.splat, <2 x i64>* %__b.addr.i, align 16
+ ret void
+}
+
+; CHECK-LABEL: isel_crash_4q
+; CHECK: vbroadcastsd {{[^,]+}}, %ymm{{[0-9]+}}
+; CHECK: ret
+define void @isel_crash_4q(i64* %cV_R.addr) {
+eintry:
+ %__a.addr.i = alloca <4 x i64>, align 16
+ %__b.addr.i = alloca <4 x i64>, align 16
+ %vCr = alloca <4 x i64>, align 16
+ store <4 x i64> zeroinitializer, <4 x i64>* %vCr, align 16
+ %tmp = load <4 x i64>* %vCr, align 16
+ %tmp2 = load i64* %cV_R.addr, align 4
+ %splat.splatinsert = insertelement <4 x i64> undef, i64 %tmp2, i32 0
+ %splat.splat = shufflevector <4 x i64> %splat.splatinsert, <4 x i64> undef, <4 x i32> zeroinitializer
+ store <4 x i64> %tmp, <4 x i64>* %__a.addr.i, align 16
+ store <4 x i64> %splat.splat, <4 x i64>* %__b.addr.i, align 16
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index 5592e6c..4ae2905 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -9,7 +9,7 @@ entry:
}
; CHECK-LABEL: test_sllw_1:
-; CHECK: vpsllw $0, %ymm0, %ymm0
+; CHECK-NOT: vpsllw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sllw_2(<16 x i16> %InVec) {
@@ -24,12 +24,12 @@ entry:
define <16 x i16> @test_sllw_3(<16 x i16> %InVec) {
entry:
- %shl = shl <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = shl <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
; CHECK-LABEL: test_sllw_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsllw $15, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_slld_1(<8 x i32> %InVec) {
@@ -39,7 +39,7 @@ entry:
}
; CHECK-LABEL: test_slld_1:
-; CHECK: vpslld $0, %ymm0, %ymm0
+; CHECK-NOT: vpslld $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_slld_2(<8 x i32> %InVec) {
@@ -54,12 +54,12 @@ entry:
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
entry:
- %shl = shl <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
; CHECK-LABEL: test_slld_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpslld $31, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_sllq_1(<4 x i64> %InVec) {
@@ -69,7 +69,7 @@ entry:
}
; CHECK-LABEL: test_sllq_1:
-; CHECK: vpsllq $0, %ymm0, %ymm0
+; CHECK-NOT: vpsllq $0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_sllq_2(<4 x i64> %InVec) {
@@ -84,12 +84,12 @@ entry:
define <4 x i64> @test_sllq_3(<4 x i64> %InVec) {
entry:
- %shl = shl <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
+ %shl = shl <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
; CHECK-LABEL: test_sllq_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsllq $63, %ymm0, %ymm0
; CHECK: ret
; AVX2 Arithmetic Shift
@@ -101,7 +101,7 @@ entry:
}
; CHECK-LABEL: test_sraw_1:
-; CHECK: vpsraw $0, %ymm0, %ymm0
+; CHECK-NOT: vpsraw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_sraw_2(<16 x i16> %InVec) {
@@ -116,7 +116,7 @@ entry:
define <16 x i16> @test_sraw_3(<16 x i16> %InVec) {
entry:
- %shl = ashr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = ashr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
@@ -131,7 +131,7 @@ entry:
}
; CHECK-LABEL: test_srad_1:
-; CHECK: vpsrad $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrad $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srad_2(<8 x i32> %InVec) {
@@ -146,7 +146,7 @@ entry:
define <8 x i32> @test_srad_3(<8 x i32> %InVec) {
entry:
- %shl = ashr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %shl = ashr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
@@ -163,7 +163,7 @@ entry:
}
; CHECK-LABEL: test_srlw_1:
-; CHECK: vpsrlw $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrlw $0, %ymm0, %ymm0
; CHECK: ret
define <16 x i16> @test_srlw_2(<16 x i16> %InVec) {
@@ -178,12 +178,12 @@ entry:
define <16 x i16> @test_srlw_3(<16 x i16> %InVec) {
entry:
- %shl = lshr <16 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = lshr <16 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <16 x i16> %shl
}
; CHECK-LABEL: test_srlw_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsrlw $15, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srld_1(<8 x i32> %InVec) {
@@ -193,7 +193,7 @@ entry:
}
; CHECK-LABEL: test_srld_1:
-; CHECK: vpsrld $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrld $0, %ymm0, %ymm0
; CHECK: ret
define <8 x i32> @test_srld_2(<8 x i32> %InVec) {
@@ -208,12 +208,12 @@ entry:
define <8 x i32> @test_srld_3(<8 x i32> %InVec) {
entry:
- %shl = lshr <8 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32, i32 32>
+ %shl = lshr <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
ret <8 x i32> %shl
}
; CHECK-LABEL: test_srld_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsrld $31, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_srlq_1(<4 x i64> %InVec) {
@@ -223,7 +223,7 @@ entry:
}
; CHECK-LABEL: test_srlq_1:
-; CHECK: vpsrlq $0, %ymm0, %ymm0
+; CHECK-NOT: vpsrlq $0, %ymm0, %ymm0
; CHECK: ret
define <4 x i64> @test_srlq_2(<4 x i64> %InVec) {
@@ -238,10 +238,21 @@ entry:
define <4 x i64> @test_srlq_3(<4 x i64> %InVec) {
entry:
- %shl = lshr <4 x i64> %InVec, <i64 64, i64 64, i64 64, i64 64>
+ %shl = lshr <4 x i64> %InVec, <i64 63, i64 63, i64 63, i64 63>
ret <4 x i64> %shl
}
; CHECK-LABEL: test_srlq_3:
-; CHECK: vxorps %ymm0, %ymm0, %ymm0
+; CHECK: vpsrlq $63, %ymm0, %ymm0
; CHECK: ret
+
+; CHECK-LABEL: @srl_trunc_and_v4i64
+; CHECK: vpand
+; CHECK-NEXT: vpsrlvd
+; CHECK: ret
+define <4 x i32> @srl_trunc_and_v4i64(<4 x i32> %x, <4 x i64> %y) nounwind {
+ %and = and <4 x i64> %y, <i64 8, i64 8, i64 8, i64 8>
+ %trunc = trunc <4 x i64> %and to <4 x i32>
+ %srl = lshr <4 x i32> %x, %trunc
+ ret <4 x i32> %srl
+}
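
The constant changes in this file track LLVM's shift semantics: a shift whose per-element amount equals or exceeds the element bit width yields undef, so the old checks expecting such shifts to fold to a zeroing vxorps were pinning down undefined behavior. Shifting by width minus one is the largest well-defined amount and must survive as a real shift instruction. A minimal sketch of the distinction (hypothetical function name):

; An i64 shift by 64 is out of range and yields undef; a shift by 63 is
; well defined and must lower to vpsllq $63 rather than fold away.
define <4 x i64> @shift_amount_sketch(<4 x i64> %x) {
entry:
  %oob = shl <4 x i64> %x, <i64 64, i64 64, i64 64, i64 64>   ; undef
  %max = shl <4 x i64> %x, <i64 63, i64 63, i64 63, i64 63>   ; vpsllq $63
  ret <4 x i64> %max
}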
diff --git a/test/CodeGen/X86/avx512-arith.ll b/test/CodeGen/X86/avx512-arith.ll
index e27600e..4d1c9f7 100644
--- a/test/CodeGen/X86/avx512-arith.ll
+++ b/test/CodeGen/X86/avx512-arith.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
; CHECK-LABEL: addpd512
; CHECK: vaddpd
@@ -163,6 +163,40 @@ define <8 x i64> @vpaddq_test(<8 x i64> %i, <8 x i64> %j) nounwind readnone {
ret <8 x i64> %x
}
+; CHECK-LABEL: vpaddq_fold_test
+; CHECK: vpaddq (%
+; CHECK: ret
+define <8 x i64> @vpaddq_fold_test(<8 x i64> %i, <8 x i64>* %j) nounwind {
+ %tmp = load <8 x i64>* %j, align 4
+ %x = add <8 x i64> %i, %tmp
+ ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq_broadcast_test
+; CHECK: vpaddq LCP{{.*}}(%rip){1to8}
+; CHECK: ret
+define <8 x i64> @vpaddq_broadcast_test(<8 x i64> %i) nounwind {
+ %x = add <8 x i64> %i, <i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1, i64 1>
+ ret <8 x i64> %x
+}
+
+; CHECK-LABEL: vpaddq_broadcast2_test
+; CHECK: vpaddq (%rdi){1to8}
+; CHECK: ret
+define <8 x i64> @vpaddq_broadcast2_test(<8 x i64> %i, i64* %j) nounwind {
+ %tmp = load i64* %j
+ %j.0 = insertelement <8 x i64> undef, i64 %tmp, i32 0
+ %j.1 = insertelement <8 x i64> %j.0, i64 %tmp, i32 1
+ %j.2 = insertelement <8 x i64> %j.1, i64 %tmp, i32 2
+ %j.3 = insertelement <8 x i64> %j.2, i64 %tmp, i32 3
+ %j.4 = insertelement <8 x i64> %j.3, i64 %tmp, i32 4
+ %j.5 = insertelement <8 x i64> %j.4, i64 %tmp, i32 5
+ %j.6 = insertelement <8 x i64> %j.5, i64 %tmp, i32 6
+ %j.7 = insertelement <8 x i64> %j.6, i64 %tmp, i32 7
+ %x = add <8 x i64> %i, %j.7
+ ret <8 x i64> %x
+}
+
; CHECK-LABEL: vpaddd_test
; CHECK: vpaddd %zmm
; CHECK: ret
@@ -171,6 +205,85 @@ define <16 x i32> @vpaddd_test(<16 x i32> %i, <16 x i32> %j) nounwind readnone {
ret <16 x i32> %x
}
+; CHECK-LABEL: vpaddd_fold_test
+; CHECK: vpaddd (%
+; CHECK: ret
+define <16 x i32> @vpaddd_fold_test(<16 x i32> %i, <16 x i32>* %j) nounwind {
+ %tmp = load <16 x i32>* %j, align 4
+ %x = add <16 x i32> %i, %tmp
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}
+; CHECK: ret
+define <16 x i32> @vpaddd_broadcast_test(<16 x i32> %i) nounwind {
+ %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ ret <16 x i32> %x
+}
+
+; CHECK-LABEL: vpaddd_mask_test
+; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_mask_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_maskz_test
+; CHECK: vpaddd {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} {z} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_maskz_test(<16 x i32> %i, <16 x i32> %j, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_mask_fold_test
+; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_mask_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %j = load <16 x i32>* %j.ptr
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_mask_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]} }}
+; CHECK: ret
+define <16 x i32> @vpaddd_mask_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %i
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_maskz_fold_test
+; CHECK: vpaddd (%rdi), {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
+; CHECK: ret
+define <16 x i32> @vpaddd_maskz_fold_test(<16 x i32> %i, <16 x i32>* %j.ptr, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %j = load <16 x i32>* %j.ptr
+ %x = add <16 x i32> %i, %j
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
+; CHECK-LABEL: vpaddd_maskz_broadcast_test
+; CHECK: vpaddd LCP{{.*}}(%rip){1to16}, {{%zmm[0-9]{1,2}, %zmm[0-9]{1,2} {%k[1-7]}}} {z}
+; CHECK: ret
+define <16 x i32> @vpaddd_maskz_broadcast_test(<16 x i32> %i, <16 x i32> %mask1) nounwind readnone {
+ %mask = icmp ne <16 x i32> %mask1, zeroinitializer
+ %x = add <16 x i32> %i, <i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1, i32 1>
+ %r = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> zeroinitializer
+ ret <16 x i32> %r
+}
+
; CHECK-LABEL: vpsubq_test
; CHECK: vpsubq %zmm
; CHECK: ret
@@ -196,7 +309,7 @@ define <16 x i32> @vpmulld_test(<16 x i32> %i, <16 x i32> %j) {
}
; CHECK-LABEL: sqrtA
-; CHECK: vsqrtssz
+; CHECK: vsqrtss {{.*}} encoding: [0x62
; CHECK: ret
declare float @sqrtf(float) readnone
define float @sqrtA(float %a) nounwind uwtable readnone ssp {
@@ -206,7 +319,7 @@ entry:
}
; CHECK-LABEL: sqrtB
-; CHECK: vsqrtsdz
+; CHECK: vsqrtsd {{.*}}## encoding: [0x62
; CHECK: ret
declare double @sqrt(double) readnone
define double @sqrtB(double %a) nounwind uwtable readnone ssp {
@@ -216,7 +329,7 @@ entry:
}
; CHECK-LABEL: sqrtC
-; CHECK: vsqrtssz
+; CHECK: vsqrtss {{.*}}## encoding: [0x62
; CHECK: ret
declare float @llvm.sqrt.f32(float)
define float @sqrtC(float %a) nounwind {
@@ -224,6 +337,24 @@ define float @sqrtC(float %a) nounwind {
ret float %b
}
+; CHECK-LABEL: sqrtD
+; CHECK: vsqrtps {{.*}}
+; CHECK: ret
+declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
+define <16 x float> @sqrtD(<16 x float> %a) nounwind {
+ %b = call <16 x float> @llvm.sqrt.v16f32(<16 x float> %a)
+ ret <16 x float> %b
+}
+
+; CHECK-LABEL: sqrtE
+; CHECK: vsqrtpd {{.*}}
+; CHECK: ret
+declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
+define <8 x double> @sqrtE(<8 x double> %a) nounwind {
+ %b = call <8 x double> @llvm.sqrt.v8f64(<8 x double> %a)
+ ret <8 x double> %b
+}
+
; CHECK-LABEL: fadd_broadcast
; CHECK: LCP{{.*}}(%rip){1to16}, %zmm0, %zmm0
; CHECK: ret
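
The vpaddd_mask*/vpaddd_maskz* tests added above rely on select folding: an unmasked add followed by a select on an i1 vector becomes one masked instruction, merging into the select's false operand for the "mask" forms ({%k1}) and zeroing the inactive lanes for the "maskz" forms ({%k1} {z}, false operand zeroinitializer). A minimal sketch of the zeroing form (hypothetical function name):

; select(mask, add(i, j), 0) should fold to a single zero-masked vpaddd.
define <16 x i32> @maskz_fold_sketch(<16 x i32> %i, <16 x i32> %j, <16 x i32> %m) {
  %mask = icmp ne <16 x i32> %m, zeroinitializer
  %sum = add <16 x i32> %i, %j
  %r = select <16 x i1> %mask, <16 x i32> %sum, <16 x i32> zeroinitializer
  ret <16 x i32> %r
}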
diff --git a/test/CodeGen/X86/avx512-build-vector.ll b/test/CodeGen/X86/avx512-build-vector.ll
index bc4560b..b5a2aa8 100644
--- a/test/CodeGen/X86/avx512-build-vector.ll
+++ b/test/CodeGen/X86/avx512-build-vector.ll
@@ -15,4 +15,16 @@ define <16 x i32> @test1(i32* %x) {
define <16 x i32> @test2(<16 x i32> %x) {
%res = add <16 x i32><i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>, %x
ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test3
+; CHECK: vinsertf128
+; CHECK: vinsertf64x4
+; CHECK: ret
+define <16 x float> @test3(<4 x float> %a) {
+ %b = extractelement <4 x float> %a, i32 2
+ %c = insertelement <16 x float> <float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float undef, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00, float 0.000000e+00>, float %b, i32 5
+ %b1 = extractelement <4 x float> %a, i32 0
+ %c1 = insertelement <16 x float> %c, float %b1, i32 6
+ ret <16 x float>%c1
} \ No newline at end of file
diff --git a/test/CodeGen/X86/avx512-cmp.ll b/test/CodeGen/X86/avx512-cmp.ll
index ba52745..47e50a9 100644
--- a/test/CodeGen/X86/avx512-cmp.ll
+++ b/test/CodeGen/X86/avx512-cmp.ll
@@ -1,6 +1,7 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
-; CHECK: vucomisdz
+; CHECK-LABEL: test1
+; CHECK: vucomisd {{.*}}encoding: [0x62
define double @test1(double %a, double %b) nounwind {
%tobool = fcmp une double %a, %b
br i1 %tobool, label %l1, label %l2
@@ -13,7 +14,8 @@ l2:
ret double %c1
}
-; CHECK: vucomissz
+; CHECK-LABEL: test2
+; CHECK: vucomiss {{.*}}encoding: [0x62
define float @test2(float %a, float %b) nounwind {
%tobool = fcmp olt float %a, %b
br i1 %tobool, label %l1, label %l2
@@ -25,3 +27,62 @@ l2:
%c1 = fadd float %a, %b
ret float %c1
}
+
+; CHECK-LABEL: test3
+; CHECK: vcmpeqss
+; CHECK: kmov
+; CHECK: ret
+define i32 @test3(float %a, float %b) {
+
+ %cmp10.i = fcmp oeq float %a, %b
+ %conv11.i = zext i1 %cmp10.i to i32
+ ret i32 %conv11.i
+}
+
+; CHECK-LABEL: test5
+; CHECK: ret
+define float @test5(float %p) #0 {
+entry:
+ %cmp = fcmp oeq float %p, 0.000000e+00
+ br i1 %cmp, label %return, label %if.end
+
+if.end: ; preds = %entry
+ %cmp1 = fcmp ogt float %p, 0.000000e+00
+ %cond = select i1 %cmp1, float 1.000000e+00, float -1.000000e+00
+ br label %return
+
+return: ; preds = %if.end, %entry
+ %retval.0 = phi float [ %cond, %if.end ], [ %p, %entry ]
+ ret float %retval.0
+}
+
+; CHECK-LABEL: test6
+; CHECK: cmpl
+; CHECK-NOT: kmov
+; CHECK: ret
+define i32 @test6(i32 %a, i32 %b) {
+ %cmp = icmp eq i32 %a, %b
+ %res = zext i1 %cmp to i32
+ ret i32 %res
+}
+
+; CHECK-LABEL: test7
+; CHECK: vucomisd
+; CHECK-NOT: kmov
+; CHECK: ret
+define i32 @test7(double %x, double %y) #2 {
+entry:
+ %0 = fcmp one double %x, %y
+ %or = zext i1 %0 to i32
+ ret i32 %or
+}
+
+define i32 @test8(i32 %a1, i32 %a2, i32 %a3) {
+ %tmp1 = icmp eq i32 %a1, -1
+ %tmp2 = icmp eq i32 %a2, -2147483648
+ %tmp3 = and i1 %tmp1, %tmp2
+ %tmp4 = icmp eq i32 %a3, 0
+ %tmp5 = or i1 %tmp3, %tmp4
+ %res = select i1 %tmp5, i32 1, i32 %a3
+ ret i32 %res
+}
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index ed68ff7..1d83485 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
; CHECK-LABEL: sitof32
; CHECK: vcvtdq2ps %zmm
@@ -67,7 +67,7 @@ define <8 x double> @fpext00(<8 x float> %b) nounwind {
}
; CHECK-LABEL: funcA
-; CHECK: vcvtsi2sdqz (%
+; CHECK: vcvtsi2sdq (%rdi){{.*}} encoding: [0x62
; CHECK: ret
define double @funcA(i64* nocapture %e) {
entry:
@@ -77,7 +77,7 @@ entry:
}
; CHECK-LABEL: funcB
-; CHECK: vcvtsi2sdlz (%
+; CHECK: vcvtsi2sdl (%{{.*}} encoding: [0x62
; CHECK: ret
define double @funcB(i32* %e) {
entry:
@@ -87,7 +87,7 @@ entry:
}
; CHECK-LABEL: funcC
-; CHECK: vcvtsi2sslz (%
+; CHECK: vcvtsi2ssl (%{{.*}} encoding: [0x62
; CHECK: ret
define float @funcC(i32* %e) {
entry:
@@ -97,7 +97,7 @@ entry:
}
; CHECK-LABEL: i64tof32
-; CHECK: vcvtsi2ssqz (%
+; CHECK: vcvtsi2ssq (%{{.*}} encoding: [0x62
; CHECK: ret
define float @i64tof32(i64* %e) {
entry:
@@ -107,7 +107,7 @@ entry:
}
; CHECK-LABEL: fpext
-; CHECK: vcvtss2sdz
+; CHECK: vcvtss2sd {{.*}} encoding: [0x62
; CHECK: ret
define void @fpext() {
entry:
@@ -120,9 +120,9 @@ entry:
}
; CHECK-LABEL: fpround_scalar
-; CHECK: vmovsdz
-; CHECK: vcvtsd2ssz
-; CHECK: vmovssz
+; CHECK: vmovsd {{.*}} encoding: [0x62
+; CHECK: vcvtsd2ss {{.*}} encoding: [0x62
+; CHECK: vmovss {{.*}} encoding: [0x62
; CHECK: ret
define void @fpround_scalar() nounwind uwtable {
entry:
@@ -135,7 +135,7 @@ entry:
}
; CHECK-LABEL: long_to_double
-; CHECK: vmovqz
+; CHECK: vmovq {{.*}} encoding: [0x62
; CHECK: ret
define double @long_to_double(i64 %x) {
%res = bitcast i64 %x to double
@@ -143,7 +143,7 @@ define double @long_to_double(i64 %x) {
}
; CHECK-LABEL: double_to_long
-; CHECK: vmovqz
+; CHECK: vmovq {{.*}} encoding: [0x62
; CHECK: ret
define i64 @double_to_long(double %x) {
%res = bitcast double %x to i64
@@ -151,7 +151,7 @@ define i64 @double_to_long(double %x) {
}
; CHECK-LABEL: int_to_float
-; CHECK: vmovdz
+; CHECK: vmovd {{.*}} encoding: [0x62
; CHECK: ret
define float @int_to_float(i32 %x) {
%res = bitcast i32 %x to float
@@ -159,7 +159,7 @@ define float @int_to_float(i32 %x) {
}
; CHECK-LABEL: float_to_int
-; CHECK: vmovdz
+; CHECK: vmovd {{.*}} encoding: [0x62
; CHECK: ret
define i32 @float_to_int(float %x) {
%res = bitcast float %x to i32
@@ -185,7 +185,7 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind {
}
; CHECK-LABEL: @fptosi02
-; CHECK vcvttss2siz
+; CHECK: vcvttss2si {{.*}} encoding: [0x62
; CHECK: ret
define i32 @fptosi02(float %a) nounwind {
%b = fptosi float %a to i32
@@ -193,7 +193,7 @@ define i32 @fptosi02(float %a) nounwind {
}
; CHECK-LABEL: @fptoui02
-; CHECK vcvttss2usiz
+; CHECK: vcvttss2usi {{.*}} encoding: [0x62
; CHECK: ret
define i32 @fptoui02(float %a) nounwind {
%b = fptoui float %a to i32
@@ -201,7 +201,7 @@ define i32 @fptoui02(float %a) nounwind {
}
; CHECK-LABEL: @uitofp02
-; CHECK vcvtusi2ss
+; CHECK: vcvtusi2ss
; CHECK: ret
define float @uitofp02(i32 %a) nounwind {
%b = uitofp i32 %a to float
@@ -209,7 +209,7 @@ define float @uitofp02(i32 %a) nounwind {
}
; CHECK-LABEL: @uitofp03
-; CHECK vcvtusi2sd
+; CHECK: vcvtusi2sd
; CHECK: ret
define double @uitofp03(i32 %a) nounwind {
%b = uitofp i32 %a to double
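
The mnemonic updates in this file (vcvtsi2sdqz becoming vcvtsi2sdq, and so on) drop the old AVX-512-only "z" suffix, so the tests pin the encoding instead: with --show-mc-encoding the assembler comment carries the byte sequence, and matching just the leading 0x62 byte is enough to prove an EVEX (AVX-512) encoding was selected without committing to the remaining bytes. A sketch of the idiom (hypothetical test name, mirroring the bitcast tests in this file):

; CHECK-LABEL: evex_sketch
; CHECK: vmovd {{.*}} encoding: [0x62
define i32 @evex_sketch(float %x) {
  %r = bitcast float %x to i32
  ret i32 %r
}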
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index 0321e95..e429a22 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -223,3 +223,81 @@ define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
+
+;CHECK-LABEL: gather_mask_dpd_execdomain
+;CHECK: vgatherdpd
+;CHECK: vmovapd
+;CHECK: ret
+define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ store <8 x double> %x, <8 x double>* %stbuf
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_qpd_execdomain
+;CHECK: vgatherqpd
+;CHECK: vmovapd
+;CHECK: ret
+define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ store <8 x double> %x, <8 x double>* %stbuf
+ ret void
+}
+
+;CHECK-LABEL: gather_mask_dps_execdomain
+;CHECK: vgatherdps
+;CHECK: vmovaps
+;CHECK: ret
+define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
+ %res = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ ret <16 x float> %res
+}
+
+;CHECK-LABEL: gather_mask_qps_execdomain
+;CHECK: vgatherqps
+;CHECK: vmovaps
+;CHECK: ret
+define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
+ %res = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ ret <8 x float> %res
+}
+
+;CHECK-LABEL: scatter_mask_dpd_execdomain
+;CHECK: vmovapd
+;CHECK: vscatterdpd
+;CHECK: ret
+define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = load <8 x double>* %src, align 64
+ call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: scatter_mask_qpd_execdomain
+;CHECK: vmovapd
+;CHECK: vscatterqpd
+;CHECK: ret
+define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = load <8 x double>* %src, align 64
+ call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: scatter_mask_dps_execdomain
+;CHECK: vmovaps
+;CHECK: vscatterdps
+;CHECK: ret
+define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
+ %x = load <16 x float>* %src, align 64
+ call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: scatter_mask_qps_execdomain
+;CHECK: vmovaps
+;CHECK: vscatterqps
+;CHECK: ret
+define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
+ %x = load <8 x float>* %src, align 32
+ call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 3f06740..6557ac3 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -44,7 +44,7 @@ define <8 x i64> @test4(<8 x i64> %x) nounwind {
}
;CHECK-LABEL: test5:
-;CHECK: vextractpsz
+;CHECK: vextractps
;CHECK: ret
define i32 @test5(<4 x float> %x) nounwind {
%ef = extractelement <4 x float> %x, i32 3
@@ -53,7 +53,7 @@ define i32 @test5(<4 x float> %x) nounwind {
}
;CHECK-LABEL: test6:
-;CHECK: vextractpsz {{.*}}, (%rdi)
+;CHECK: vextractps {{.*}}, (%rdi)
;CHECK: ret
define void @test6(<4 x float> %x, float* %out) nounwind {
%ef = extractelement <4 x float> %x, i32 3
@@ -62,7 +62,7 @@ define void @test6(<4 x float> %x, float* %out) nounwind {
}
;CHECK-LABEL: test7
-;CHECK: vmovdz
+;CHECK: vmovd
;CHECK: vpermps %zmm
;CHECK: ret
define float @test7(<16 x float> %x, i32 %ind) nounwind {
@@ -71,7 +71,7 @@ define float @test7(<16 x float> %x, i32 %ind) nounwind {
}
;CHECK-LABEL: test8
-;CHECK: vmovqz
+;CHECK: vmovq
;CHECK: vpermpd %zmm
;CHECK: ret
define double @test8(<8 x double> %x, i32 %ind) nounwind {
@@ -89,9 +89,9 @@ define float @test9(<8 x float> %x, i32 %ind) nounwind {
}
;CHECK-LABEL: test10
-;CHECK: vmovdz
+;CHECK: vmovd
;CHECK: vpermd %zmm
-;CHEKK: vmovdz %xmm0, %eax
+;CHECK: vmovd %xmm0, %eax
;CHECK: ret
define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
%e = extractelement <16 x i32> %x, i32 %ind
@@ -99,27 +99,62 @@ define i32 @test10(<16 x i32> %x, i32 %ind) nounwind {
}
;CHECK-LABEL: test11
-;CHECK: movl $260
-;CHECK: bextrl
-;CHECK: movl $268
-;CHECK: bextrl
+;CHECK: vpcmpltud
+;CHECK: kshiftlw $11
+;CHECK: kshiftrw $15
+;CHECK: kortestw
+;CHECK: je
+;CHECK: ret
;CHECK: ret
define <16 x i32> @test11(<16 x i32>%a, <16 x i32>%b) {
%cmp_res = icmp ult <16 x i32> %a, %b
%ia = extractelement <16 x i1> %cmp_res, i32 4
- %ib = extractelement <16 x i1> %cmp_res, i32 12
-
br i1 %ia, label %A, label %B
-
A:
ret <16 x i32>%b
B:
%c = add <16 x i32>%b, %a
- br i1 %ib, label %C, label %D
- C:
- %c1 = sub <16 x i32>%c, %a
- ret <16 x i32>%c1
- D:
- %c2 = mul <16 x i32>%c, %a
- ret <16 x i32>%c2
+ ret <16 x i32>%c
+}
+
+;CHECK-LABEL: test12
+;CHECK: vpcmpgtq
+;CHECK: kshiftlw $15
+;CHECK: kshiftrw $15
+;CHECK: kortestw
+;CHECK: ret
+
+define i64 @test12(<16 x i64>%a, <16 x i64>%b, i64 %a1, i64 %b1) {
+
+ %cmpvector_func.i = icmp slt <16 x i64> %a, %b
+ %extract24vector_func.i = extractelement <16 x i1> %cmpvector_func.i, i32 0
+ %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
+ ret i64 %res
+}
+
+;CHECK-LABEL: test13
+;CHECK: cmpl
+;CHECK: sbbl
+;CHECK: orl $65532
+;CHECK: ret
+define i16 @test13(i32 %a, i32 %b) {
+ %cmp_res = icmp ult i32 %a, %b
+ %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %cmp_res, i32 0
+ %res = bitcast <16 x i1> %maskv to i16
+ ret i16 %res
+}
+
+;CHECK-LABEL: test14
+;CHECK: vpcmpgtq
+;CHECK: kshiftlw $11
+;CHECK: kshiftrw $15
+;CHECK: kortestw
+;CHECK: ret
+
+define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
+
+ %cmpvector_func.i = icmp slt <8 x i64> %a, %b
+ %extract24vector_func.i = extractelement <8 x i1> %cmpvector_func.i, i32 4
+ %res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
+ ret i64 %res
}
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 5bdabf2..3fb38ed 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -1,108 +1,136 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
-declare i32 @llvm.x86.avx512.kortestz(i16, i16) nounwind readnone
-; CHECK: test_kortestz
+declare i32 @llvm.x86.avx512.kortestz.w(i16, i16) nounwind readnone
+; CHECK-LABEL: test_kortestz
; CHECK: kortestw
; CHECK: sete
define i32 @test_kortestz(i16 %a0, i16 %a1) {
- %res = call i32 @llvm.x86.avx512.kortestz(i16 %a0, i16 %a1)
+ %res = call i32 @llvm.x86.avx512.kortestz.w(i16 %a0, i16 %a1)
ret i32 %res
}
-declare i32 @llvm.x86.avx512.kortestc(i16, i16) nounwind readnone
-; CHECK: test_kortestc
+declare i32 @llvm.x86.avx512.kortestc.w(i16, i16) nounwind readnone
+; CHECK-LABEL: test_kortestc
; CHECK: kortestw
; CHECK: sbbl
define i32 @test_kortestc(i16 %a0, i16 %a1) {
- %res = call i32 @llvm.x86.avx512.kortestc(i16 %a0, i16 %a1)
+ %res = call i32 @llvm.x86.avx512.kortestc.w(i16 %a0, i16 %a1)
ret i32 %res
}
+declare i16 @llvm.x86.avx512.kand.w(i16, i16) nounwind readnone
+; CHECK-LABEL: test_kand
+; CHECK: kandw
+; CHECK: kandw
+define i16 @test_kand(i16 %a0, i16 %a1) {
+ %t1 = call i16 @llvm.x86.avx512.kand.w(i16 %a0, i16 8)
+ %t2 = call i16 @llvm.x86.avx512.kand.w(i16 %t1, i16 %a1)
+ ret i16 %t2
+}
+
+declare i16 @llvm.x86.avx512.knot.w(i16) nounwind readnone
+; CHECK-LABEL: test_knot
+; CHECK: knotw
+define i16 @test_knot(i16 %a0) {
+ %res = call i16 @llvm.x86.avx512.knot.w(i16 %a0)
+ ret i16 %res
+}
+
+declare i16 @llvm.x86.avx512.kunpck.bw(i16, i16) nounwind readnone
+
+; CHECK-LABEL: unpckbw_test
+; CHECK: kunpckbw
+; CHECK: ret
+define i16 @unpckbw_test(i16 %a0, i16 %a1) {
+ %res = call i16 @llvm.x86.avx512.kunpck.bw(i16 %a0, i16 %a1)
+ ret i16 %res
+}
+
define <16 x float> @test_rcp_ps_512(<16 x float> %a0) {
- ; CHECK: vrcp14ps
- %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrcp14ps {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x4c,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rcp14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <8 x double> @test_rcp_pd_512(<8 x double> %a0) {
- ; CHECK: vrcp14pd
- %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ ; CHECK: vrcp14pd {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x4c,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1) ; <<8 x double>> [#uses=1]
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.rcp14.pd.512(<8 x double>, <8 x double>, i8) nounwind readnone
define <16 x float> @test_rcp28_ps_512(<16 x float> %a0) {
- ; CHECK: vrcp28ps
- %res = call <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrcp28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xca,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rcp28.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rcp28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <8 x double> @test_rcp28_pd_512(<8 x double> %a0) {
- ; CHECK: vrcp28pd
- %res = call <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double> %a0) ; <<8 x double>> [#uses=1]
+ ; CHECK: vrcp28pd {sae}, {{.*}}encoding: [0x62,0xf2,0xfd,0x18,0xca,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double> %a0, <8 x double> zeroinitializer, i8 -1, i32 8) ; <<8 x double>> [#uses=1]
ret <8 x double> %res
}
-declare <8 x double> @llvm.x86.avx512.rcp28.pd.512(<8 x double>) nounwind readnone
+declare <8 x double> @llvm.x86.avx512.rcp28.pd(<8 x double>, <8 x double>, i8, i32) nounwind readnone
-define <8 x double> @test_rndscale_pd_512(<8 x double> %a0) {
- ; CHECK: vrndscale
- %res = call <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double> %a0, i32 7) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
+declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8 x double>, i8, i32)
+
+define <8 x double> @test7(<8 x double> %a) {
+; CHECK: vrndscalepd {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0b]
+ %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> zeroinitializer, i8 -1, i32 4)
+ ret <8 x double>%res
}
-declare <8 x double> @llvm.x86.avx512.rndscale.pd.512(<8 x double>, i32) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <16 x float>, i16, i32)
-define <16 x float> @test_rndscale_ps_512(<16 x float> %a0) {
- ; CHECK: vrndscale
- %res = call <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float> %a0, i32 7) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
+define <16 x float> @test8(<16 x float> %a) {
+; CHECK: vrndscaleps {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0b]
+ %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> zeroinitializer, i16 -1, i32 4)
+ ret <16 x float>%res
}
-declare <16 x float> @llvm.x86.avx512.rndscale.ps.512(<16 x float>, i32) nounwind readnone
-
define <16 x float> @test_rsqrt_ps_512(<16 x float> %a0) {
- ; CHECK: vrsqrt14ps
- %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrsqrt14ps {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x4e,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rsqrt14.ps.512(<16 x float>, <16 x float>, i16) nounwind readnone
define <16 x float> @test_rsqrt28_ps_512(<16 x float> %a0) {
- ; CHECK: vrsqrt28ps
- %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float> %a0) ; <<16 x float>> [#uses=1]
+ ; CHECK: vrsqrt28ps {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcc,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float> %a0, <16 x float> zeroinitializer, i16 -1, i32 8) ; <<16 x float>> [#uses=1]
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.rsqrt28.ps.512(<16 x float>) nounwind readnone
+declare <16 x float> @llvm.x86.avx512.rsqrt28.ps(<16 x float>, <16 x float>, i16, i32) nounwind readnone
define <4 x float> @test_rsqrt14_ss(<4 x float> %a0) {
- ; CHECK: vrsqrt14ss
- %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrsqrt14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4f,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rsqrt14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_rsqrt28_ss(<4 x float> %a0) {
- ; CHECK: vrsqrt28ss
- %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrsqrt28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcd,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rsqrt28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <4 x float> @test_rcp14_ss(<4 x float> %a0) {
- ; CHECK: vrcp14ss
- %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrcp14ss {{.*}}encoding: [0x62,0xf2,0x7d,0x08,0x4d,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rcp14.ss(<4 x float>, <4 x float>, <4 x float>, i8) nounwind readnone
define <4 x float> @test_rcp28_ss(<4 x float> %a0) {
- ; CHECK: vrcp28ss
- %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0) ; <<4 x float>> [#uses=1]
+ ; CHECK: vrcp28ss {sae}, {{.*}}encoding: [0x62,0xf2,0x7d,0x18,0xcb,0xc0]
+ %res = call <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float> %a0, <4 x float> %a0, <4 x float> zeroinitializer, i8 -1, i32 8) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
-declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>) nounwind readnone
+declare <4 x float> @llvm.x86.avx512.rcp28.ss(<4 x float>, <4 x float>, <4 x float>, i8, i32) nounwind readnone
define <8 x double> @test_sqrt_pd_512(<8 x double> %a0) {
; CHECK: vsqrtpd
@@ -119,42 +147,42 @@ define <16 x float> @test_sqrt_ps_512(<16 x float> %a0) {
declare <16 x float> @llvm.x86.avx512.sqrt.ps.512(<16 x float>) nounwind readnone
define <4 x float> @test_sqrt_ss(<4 x float> %a0, <4 x float> %a1) {
- ; CHECK: vsqrtssz
+ ; CHECK: vsqrtss {{.*}}encoding: [0x62
%res = call <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float> %a0, <4 x float> %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
declare <4 x float> @llvm.x86.avx512.sqrt.ss(<4 x float>, <4 x float>) nounwind readnone
define <2 x double> @test_sqrt_sd(<2 x double> %a0, <2 x double> %a1) {
- ; CHECK: vsqrtsdz
+ ; CHECK: vsqrtsd {{.*}}encoding: [0x62
%res = call <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double> %a0, <2 x double> %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.sqrt.sd(<2 x double>, <2 x double>) nounwind readnone
define i64 @test_x86_sse2_cvtsd2si64(<2 x double> %a0) {
- ; CHECK: vcvtsd2siz
+ ; CHECK: vcvtsd2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse2.cvtsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
ret i64 %res
}
declare i64 @llvm.x86.sse2.cvtsd2si64(<2 x double>) nounwind readnone
define <2 x double> @test_x86_sse2_cvtsi642sd(<2 x double> %a0, i64 %a1) {
- ; CHECK: vcvtsi2sdqz
+ ; CHECK: vcvtsi2sdq {{.*}}encoding: [0x62
%res = call <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.sse2.cvtsi642sd(<2 x double>, i64) nounwind readnone
define <2 x double> @test_x86_avx512_cvtusi642sd(<2 x double> %a0, i64 %a1) {
- ; CHECK: vcvtusi2sdqz
+ ; CHECK: vcvtusi2sdq {{.*}}encoding: [0x62
%res = call <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double> %a0, i64 %a1) ; <<2 x double>> [#uses=1]
ret <2 x double> %res
}
declare <2 x double> @llvm.x86.avx512.cvtusi642sd(<2 x double>, i64) nounwind readnone
define i64 @test_x86_sse2_cvttsd2si64(<2 x double> %a0) {
- ; CHECK: vcvttsd2siz
+ ; CHECK: vcvttsd2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse2.cvttsd2si64(<2 x double> %a0) ; <i64> [#uses=1]
ret i64 %res
}
@@ -162,7 +190,7 @@ declare i64 @llvm.x86.sse2.cvttsd2si64(<2 x double>) nounwind readnone
define i64 @test_x86_sse_cvtss2si64(<4 x float> %a0) {
- ; CHECK: vcvtss2siz
+ ; CHECK: vcvtss2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse.cvtss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
@@ -170,7 +198,7 @@ declare i64 @llvm.x86.sse.cvtss2si64(<4 x float>) nounwind readnone
define <4 x float> @test_x86_sse_cvtsi642ss(<4 x float> %a0, i64 %a1) {
- ; CHECK: vcvtsi2ssqz
+ ; CHECK: vcvtsi2ssq {{.*}}encoding: [0x62
%res = call <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float> %a0, i64 %a1) ; <<4 x float>> [#uses=1]
ret <4 x float> %res
}
@@ -178,33 +206,34 @@ declare <4 x float> @llvm.x86.sse.cvtsi642ss(<4 x float>, i64) nounwind readnone
define i64 @test_x86_sse_cvttss2si64(<4 x float> %a0) {
- ; CHECK: vcvttss2siz
+ ; CHECK: vcvttss2si {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.sse.cvttss2si64(<4 x float> %a0) ; <i64> [#uses=1]
ret i64 %res
}
declare i64 @llvm.x86.sse.cvttss2si64(<4 x float>) nounwind readnone
define i64 @test_x86_avx512_cvtsd2usi64(<2 x double> %a0) {
- ; CHECK: vcvtsd2usiz
+ ; CHECK: vcvtsd2usi {{.*}}encoding: [0x62
%res = call i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double> %a0) ; <i64> [#uses=1]
ret i64 %res
}
declare i64 @llvm.x86.avx512.cvtsd2usi64(<2 x double>) nounwind readnone
define <16 x float> @test_x86_vcvtph2ps_512(<16 x i16> %a0) {
- ; CHECK: vcvtph2ps
- %res = call <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16> %a0)
+ ; CHECK: vcvtph2ps %ymm0, %zmm0 ## encoding: [0x62,0xf2,0x7d,0x48,0x13,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16> %a0, <16 x float> zeroinitializer, i16 -1, i32 4)
ret <16 x float> %res
}
-declare <16 x float> @llvm.x86.avx512.vcvtph2ps.512(<16 x i16>) nounwind readonly
+declare <16 x float> @llvm.x86.avx512.mask.vcvtph2ps.512(<16 x i16>, <16 x float>, i16, i32) nounwind readonly
define <16 x i16> @test_x86_vcvtps2ph_256(<16 x float> %a0) {
- ; CHECK: vcvtps2ph
- %res = call <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float> %a0, i32 0)
+ ; CHECK: vcvtps2ph $2, %zmm0, %ymm0 ## encoding: [0x62,0xf3,0x7d,0x48,0x1d,0xc0,0x02]
+ %res = call <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float> %a0, i32 2, <16 x i16> zeroinitializer, i16 -1)
ret <16 x i16> %res
}
-declare <16 x i16> @llvm.x86.avx512.vcvtps2ph.512(<16 x float>, i32) nounwind readonly
+
+declare <16 x i16> @llvm.x86.avx512.mask.vcvtps2ph.512(<16 x float>, i32, <16 x i16>, i16) nounwind readonly
define <16 x float> @test_x86_vbroadcast_ss_512(i8* %a0) {
; CHECK: vbroadcastss
@@ -262,113 +291,249 @@ define <8 x i64> @test_x86_pbroadcastq_i64_512(i64 %a0) {
}
declare <8 x i64> @llvm.x86.avx512.pbroadcastq.i64.512(i64) nounwind readonly
-define <16 x i32> @test_x86_pmaxu_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpmaxud
- %res = call <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+define <16 x i32> @test_conflict_d(<16 x i32> %a) {
+ ; CHECK: movw $-1, %ax
+ ; CHECK: vpxor
+ ; CHECK: vpconflictd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.pmaxu.d(<16 x i32>, <16 x i32>) nounwind readonly
-define <8 x i64> @test_x86_pmaxu_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpmaxuq
- %res = call <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+declare <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
+
+define <8 x i64> @test_conflict_q(<8 x i64> %a) {
+ ; CHECK: movb $-1, %al
+ ; CHECK: vpxor
+ ; CHECK: vpconflictq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pmaxu.q(<8 x i64>, <8 x i64>) nounwind readonly
-define <16 x i32> @test_x86_pmaxs_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpmaxsd
- %res = call <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+declare <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+
+define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
+ ; CHECK: vpconflictd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.conflict.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 %mask)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.pmaxs.d(<16 x i32>, <16 x i32>) nounwind readonly
-define <8 x i64> @test_x86_pmaxs_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpmaxsq
- %res = call <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
+ ; CHECK: vpconflictq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.conflict.q.512(<8 x i64> %a, <8 x i64> %b, i8 %mask)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pmaxs.q(<8 x i64>, <8 x i64>) nounwind readonly
-define <16 x i32> @test_x86_pminu_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpminud
- %res = call <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
+define <16 x float> @test_x86_mask_blend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
+ ; CHECK: vblendmps
+ %res = call <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float> %a1, <16 x float> %a2, i16 %a0) ; <<16 x float>> [#uses=1]
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.blend.ps.512(<16 x float>, <16 x float>, i16) nounwind readonly
+
+define <8 x double> @test_x86_mask_blend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
+ ; CHECK: vblendmpd
+ %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a1, <8 x double> %a2, i8 %a0) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+
+define <8 x double> @test_x86_mask_blend_pd_512_memop(<8 x double> %a, <8 x double>* %ptr, i8 %mask) {
+ ; CHECK-LABEL: test_x86_mask_blend_pd_512_memop
+ ; CHECK: vblendmpd (%
+ %b = load <8 x double>* %ptr
+ %res = call <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double> %a, <8 x double> %b, i8 %mask) ; <<8 x double>> [#uses=1]
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.blend.pd.512(<8 x double>, <8 x double>, i8) nounwind readonly
+
+define <16 x i32> @test_x86_mask_blend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
+ ; CHECK: vpblendmd
+ %res = call <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32> %a1, <16 x i32> %a2, i16 %a0) ; <<16 x i32>> [#uses=1]
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.pminu.d(<16 x i32>, <16 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.blend.d.512(<16 x i32>, <16 x i32>, i16) nounwind readonly
-define <8 x i64> @test_x86_pminu_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpminuq
- %res = call <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+define <8 x i64> @test_x86_mask_blend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
+ ; CHECK: vpblendmq
+ %res = call <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64> %a1, <8 x i64> %a2, i8 %a0) ; <<8 x i64>> [#uses=1]
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pminu.q(<8 x i64>, <8 x i64>) nounwind readonly
+declare <8 x i64> @llvm.x86.avx512.mask.blend.q.512(<8 x i64>, <8 x i64>, i8) nounwind readonly
+
+define <8 x i32> @test_cvtpd2udq(<8 x double> %a) {
+ ; CHECK: vcvtpd2udq {ru-sae}{{.*}}encoding: [0x62,0xf1,0xfc,0x58,0x79,0xc0]
+ %res = call <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double> %a, <8 x i32> zeroinitializer, i8 -1, i32 2)
+ ret <8 x i32> %res
+}
+declare <8 x i32> @llvm.x86.avx512.mask.cvtpd2udq.512(<8 x double>, <8 x i32>, i8, i32)
+
+define <16 x i32> @test_cvtps2udq(<16 x float> %a) {
+ ; CHECK: vcvtps2udq {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x79,0xc0]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float> %a, <16 x i32> zeroinitializer, i16 -1, i32 1)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.mask.cvtps2udq.512(<16 x float>, <16 x i32>, i16, i32)
+
+define i16 @test_cmpps(<16 x float> %a, <16 x float> %b) {
+ ; CHECK: vcmpleps {sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x18,0xc2,0xc1,0x02]
+ %res = call i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float> %a, <16 x float> %b, i32 2, i16 -1, i32 8)
+ ret i16 %res
+}
+declare i16 @llvm.x86.avx512.mask.cmp.ps.512(<16 x float>, <16 x float>, i32, i16, i32)
+
+define i8 @test_cmppd(<8 x double> %a, <8 x double> %b) {
+ ; CHECK: vcmpneqpd %zmm{{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xc2,0xc1,0x04]
+ %res = call i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double> %a, <8 x double> %b, i32 4, i8 -1, i32 4)
+ ret i8 %res
+}
+declare i8 @llvm.x86.avx512.mask.cmp.pd.512(<8 x double>, <8 x double>, i32, i8, i32)
+
+; cvt intrinsics
+define <16 x float> @test_cvtdq2ps(<16 x i32> %a) {
+ ; CHECK: vcvtdq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7c,0x38,0x5b,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32> %a, <16 x float> zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.cvtdq2ps.512(<16 x i32>, <16 x float>, i16, i32)
+
+define <16 x float> @test_cvtudq2ps(<16 x i32> %a) {
+ ; CHECK: vcvtudq2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0x7f,0x38,0x7a,0xc0]
+ %res = call <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32> %a, <16 x float> zeroinitializer, i16 -1, i32 1)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.cvtudq2ps.512(<16 x i32>, <16 x float>, i16, i32)
+
+define <8 x double> @test_cvtdq2pd(<8 x i32> %a) {
+ ; CHECK: vcvtdq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0xe6,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32> %a, <8 x double> zeroinitializer, i8 -1)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.cvtdq2pd.512(<8 x i32>, <8 x double>, i8)
+
+define <8 x double> @test_cvtudq2pd(<8 x i32> %a) {
+ ; CHECK: vcvtudq2pd {{.*}}encoding: [0x62,0xf1,0x7e,0x48,0x7a,0xc0]
+ %res = call <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32> %a, <8 x double> zeroinitializer, i8 -1)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.cvtudq2pd.512(<8 x i32>, <8 x double>, i8)
+
+; fp min - max
+define <16 x float> @test_vmaxps(<16 x float> %a0, <16 x float> %a1) {
+ ; CHECK: vmaxps
+ %res = call <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float> %a0, <16 x float> %a1,
+ <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.max.ps.512(<16 x float>, <16 x float>,
+ <16 x float>, i16, i32)
-define <16 x i32> @test_x86_pmins_d(<16 x i32> %a0, <16 x i32> %a1) {
- ; CHECK: vpminsd
- %res = call <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32> %a0, <16 x i32> %a1) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
+define <8 x double> @test_vmaxpd(<8 x double> %a0, <8 x double> %a1) {
+ ; CHECK: vmaxpd
+ %res = call <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double> %a0, <8 x double> %a1,
+ <8 x double>zeroinitializer, i8 -1, i32 4)
+ ret <8 x double> %res
}
-declare <16 x i32> @llvm.x86.avx512.pmins.d(<16 x i32>, <16 x i32>) nounwind readonly
+declare <8 x double> @llvm.x86.avx512.mask.max.pd.512(<8 x double>, <8 x double>,
+ <8 x double>, i8, i32)
-define <8 x i64> @test_x86_pmins_q(<8 x i64> %a0, <8 x i64> %a1) {
- ; CHECK: vpminsq
- %res = call <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64> %a0, <8 x i64> %a1) ; <<8 x i64>> [#uses=1]
+define <16 x float> @test_vminps(<16 x float> %a0, <16 x float> %a1) {
+ ; CHECK: vminps
+ %res = call <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float> %a0, <16 x float> %a1,
+ <16 x float>zeroinitializer, i16 -1, i32 4)
+ ret <16 x float> %res
+}
+declare <16 x float> @llvm.x86.avx512.mask.min.ps.512(<16 x float>, <16 x float>,
+ <16 x float>, i16, i32)
+
+define <8 x double> @test_vminpd(<8 x double> %a0, <8 x double> %a1) {
+ ; CHECK: vminpd
+ %res = call <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double> %a0, <8 x double> %a1,
+ <8 x double>zeroinitializer, i8 -1, i32 4)
+ ret <8 x double> %res
+}
+declare <8 x double> @llvm.x86.avx512.mask.min.pd.512(<8 x double>, <8 x double>,
+ <8 x double>, i8, i32)
+
+define <8 x float> @test_cvtpd2ps(<8 x double> %a) {
+ ; CHECK: vcvtpd2ps {rd-sae}{{.*}}encoding: [0x62,0xf1,0xfd,0x38,0x5a,0xc0]
+ %res = call <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double> %a, <8 x float> zeroinitializer, i8 -1, i32 1)
+ ret <8 x float> %res
+}
+declare <8 x float> @llvm.x86.avx512.mask.cvtpd2ps.512(<8 x double>, <8 x float>, i8, i32)
+
+define <16 x i32> @test_pabsd(<16 x i32> %a) {
+ ; CHECK: vpabsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x1e,0xc0]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32> %a, <16 x i32> zeroinitializer, i16 -1)
+ ret <16 x i32> %res
+}
+declare <16 x i32> @llvm.x86.avx512.mask.pabs.d.512(<16 x i32>, <16 x i32>, i16)
+
+define <8 x i64> @test_pabsq(<8 x i64> %a) {
+ ; CHECK: vpabsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x1f,0xc0]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64> %a, <8 x i64> zeroinitializer, i8 -1)
+ ret <8 x i64> %res
+}
+declare <8 x i64> @llvm.x86.avx512.mask.pabs.q.512(<8 x i64>, <8 x i64>, i8)
+
+define <8 x i64> @test_vpmaxq(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vpmaxsq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x3d,0xc1]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64> %a0, <8 x i64> %a1,
+ <8 x i64>zeroinitializer, i8 -1)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.pmins.q(<8 x i64>, <8 x i64>) nounwind readonly
+declare <8 x i64> @llvm.x86.avx512.mask.pmaxs.q.512(<8 x i64>, <8 x i64>, <8 x i64>, i8)
-define <16 x i32> @test_conflict_d(<16 x i32> %a) {
- ; CHECK: vpconflictd
- %res = call <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32> %a)
+define <16 x i32> @test_vpminud(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpminud {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x3b,0xc1]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %a0, <16 x i32> %a1,
+ <16 x i32>zeroinitializer, i16 -1)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.conflict.d.512(<16 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-define <16 x i32> @test_maskz_conflict_d(<16 x i32> %a, i16 %mask) {
- ; CHECK: vpconflictd %zmm0, %zmm0 {%k1} {z}
- %vmask = bitcast i16 %mask to <16 x i1>
- %res = call <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1> %vmask, <16 x i32> %a)
+define <16 x i32> @test_vpmaxsd(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpmaxsd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x3d,0xc1]
+ %res = call <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32> %a0, <16 x i32> %a1,
+ <16 x i32>zeroinitializer, i16 -1)
ret <16 x i32> %res
}
-declare <16 x i32> @llvm.x86.avx512.conflict.d.maskz.512(<16 x i1>,<16 x i32>) nounwind readonly
+declare <16 x i32> @llvm.x86.avx512.mask.pmaxs.d.512(<16 x i32>, <16 x i32>, <16 x i32>, i16)
-define <8 x i64> @test_mask_conflict_q(<8 x i64> %a, <8 x i64> %b, i8 %mask) {
- ; CHECK: vpconflictq {{.*}} {%k1}
- %vmask = bitcast i8 %mask to <8 x i1>
- %res = call <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64> %b, <8 x i1> %vmask, <8 x i64> %a)
+define <8 x i64> @test_vpmuludq(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vpmuludq {{.*}}encoding: [0x62,0xf1,0xfd,0x48,0xf4,0xc1]
+ %res = call <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32> %a0, <16 x i32> %a1,
+ <8 x i64>zeroinitializer, i8 -1)
ret <8 x i64> %res
}
-declare <8 x i64> @llvm.x86.avx512.conflict.q.mask.512(<8 x i64>, <8 x i1>,<8 x i64>) nounwind readonly
+declare <8 x i64> @llvm.x86.avx512.mask.pmulu.dq.512(<16 x i32>, <16 x i32>, <8 x i64>, i8)
-define <16 x float> @test_x86_mskblend_ps_512(i16 %a0, <16 x float> %a1, <16 x float> %a2) {
- ; CHECK: vblendmps
- %m0 = bitcast i16 %a0 to <16 x i1>
- %res = call <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %m0, <16 x float> %a1, <16 x float> %a2) ; <<16 x float>> [#uses=1]
- ret <16 x float> %res
+define i8 @test_vptestmq(<8 x i64> %a0, <8 x i64> %a1) {
+ ; CHECK: vptestmq {{.*}}encoding: [0x62,0xf2,0xfd,0x48,0x27,0xc1]
+ %res = call i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64> %a0, <8 x i64> %a1, i8 -1)
+ ret i8 %res
}
-declare <16 x float> @llvm.x86.avx512.mskblend.ps.512(<16 x i1> %a0, <16 x float> %a1, <16 x float> %a2) nounwind readonly
+declare i8 @llvm.x86.avx512.mask.ptestm.q.512(<8 x i64>, <8 x i64>, i8)
-define <8 x double> @test_x86_mskblend_pd_512(i8 %a0, <8 x double> %a1, <8 x double> %a2) {
- ; CHECK: vblendmpd
- %m0 = bitcast i8 %a0 to <8 x i1>
- %res = call <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %m0, <8 x double> %a1, <8 x double> %a2) ; <<8 x double>> [#uses=1]
- ret <8 x double> %res
+define i16 @test_vptestmd(<16 x i32> %a0, <16 x i32> %a1) {
+ ; CHECK: vptestmd {{.*}}encoding: [0x62,0xf2,0x7d,0x48,0x27,0xc1]
+ %res = call i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32> %a0, <16 x i32> %a1, i16 -1)
+ ret i16 %res
}
-declare <8 x double> @llvm.x86.avx512.mskblend.pd.512(<8 x i1> %a0, <8 x double> %a1, <8 x double> %a2) nounwind readonly
+declare i16 @llvm.x86.avx512.mask.ptestm.d.512(<16 x i32>, <16 x i32>, i16)
-define <16 x i32> @test_x86_mskblend_d_512(i16 %a0, <16 x i32> %a1, <16 x i32> %a2) {
- ; CHECK: vpblendmd
- %m0 = bitcast i16 %a0 to <16 x i1>
- %res = call <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %m0, <16 x i32> %a1, <16 x i32> %a2) ; <<16 x i32>> [#uses=1]
- ret <16 x i32> %res
+define void @test_store1(<16 x float> %data, i8* %ptr, i16 %mask) {
+; CHECK: vmovups {{.*}}encoding: [0x62,0xf1,0x7c,0x49,0x11,0x07]
+ call void @llvm.x86.avx512.mask.storeu.ps.512(i8* %ptr, <16 x float> %data, i16 %mask)
+ ret void
}
-declare <16 x i32> @llvm.x86.avx512.mskblend.d.512(<16 x i1> %a0, <16 x i32> %a1, <16 x i32> %a2) nounwind readonly
-define <8 x i64> @test_x86_mskblend_q_512(i8 %a0, <8 x i64> %a1, <8 x i64> %a2) {
- ; CHECK: vpblendmq
- %m0 = bitcast i8 %a0 to <8 x i1>
- %res = call <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %m0, <8 x i64> %a1, <8 x i64> %a2) ; <<8 x i64>> [#uses=1]
- ret <8 x i64> %res
+declare void @llvm.x86.avx512.mask.storeu.ps.512(i8*, <16 x float>, i16 )
+
+define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
+; CHECK: vmovupd {{.*}}encoding: [0x62,0xf1,0xfd,0x49,0x11,0x07]
+ call void @llvm.x86.avx512.mask.storeu.pd.512(i8* %ptr, <8 x double> %data, i8 %mask)
+ ret void
}
-declare <8 x i64> @llvm.x86.avx512.mskblend.q.512(<8 x i1> %a0, <8 x i64> %a1, <8 x i64> %a2) nounwind readonly
+
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 ) \ No newline at end of file
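
The rewritten intrinsics in this file share one calling convention: each llvm.x86.avx512.mask.* intrinsic takes the original operands, then a pass-through vector and an integer mask (plus a rounding-mode immediate where applicable), and the tests above pass an all-ones mask (-1) so every lane is computed. A sketch of a genuinely masked call, reusing the pminu.d declaration from this file (that inactive lanes come from the pass-through operand is the usual masked-intrinsic convention, stated here as an assumption):

; Lanes whose mask bit is clear are assumed to come from the pass-through
; operand (zeroinitializer here); set lanes receive umin(a, b).
define <16 x i32> @masked_pminud_sketch(<16 x i32> %a, <16 x i32> %b, i16 %m) {
  %r = call <16 x i32> @llvm.x86.avx512.mask.pminu.d.512(<16 x i32> %a, <16 x i32> %b, <16 x i32> zeroinitializer, i16 %m)
  ret <16 x i32> %r
}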
diff --git a/test/CodeGen/X86/avx512-mask-op.ll b/test/CodeGen/X86/avx512-mask-op.ll
index ef5cb56..dd33ffd 100644
--- a/test/CodeGen/X86/avx512-mask-op.ll
+++ b/test/CodeGen/X86/avx512-mask-op.ll
@@ -33,19 +33,6 @@ define i16 @mand16(i16 %x, i16 %y) {
ret i16 %ret
}
-; CHECK: unpckbw_test
-; CHECK: kunpckbw
-; CHECK:ret
-declare <16 x i1> @llvm.x86.kunpck.v16i1(<8 x i1>, <8 x i1>) nounwind readnone
-
-define i16 @unpckbw_test(i8 %x, i8 %y) {
- %m0 = bitcast i8 %x to <8 x i1>
- %m1 = bitcast i8 %y to <8 x i1>
- %k = tail call <16 x i1> @llvm.x86.kunpck.v16i1(<8 x i1> %m0, <8 x i1> %m1)
- %r = bitcast <16 x i1> %k to i16
- ret i16 %r
-}
-
; CHECK: shuf_test1
; CHECK: kshiftrw $8
; CHECK:ret
@@ -55,3 +42,39 @@ define i8 @shuf_test1(i16 %v) nounwind {
%mask1 = bitcast <8 x i1> %mask to i8
ret i8 %mask1
}
+
+; CHECK: zext_test1
+; CHECK: kshiftlw
+; CHECK: kshiftrw
+; CHECK: kmovw
+; CHECK:ret
+define i32 @zext_test1(<16 x i32> %a, <16 x i32> %b) {
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i32
+ ret i32 %res
+}
+
+; CHECK: zext_test2
+; CHECK: kshiftlw
+; CHECK: kshiftrw
+; CHECK: kmovw
+; CHECK:ret
+define i16 @zext_test2(<16 x i32> %a, <16 x i32> %b) {
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i16
+ ret i16 %res
+}
+
+; CHECK: zext_test3
+; CHECK: kshiftlw
+; CHECK: kshiftrw
+; CHECK: kmovw
+; CHECK:ret
+define i8 @zext_test3(<16 x i32> %a, <16 x i32> %b) {
+ %cmp_res = icmp ugt <16 x i32> %a, %b
+ %cmp_res.i1 = extractelement <16 x i1> %cmp_res, i32 5
+ %res = zext i1 %cmp_res.i1 to i8
+ ret i8 %res
+}
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 91242b1..13e6843 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -1,7 +1,7 @@
-; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
; CHECK-LABEL: @test1
-; CHECK: vmovdz %xmm0, %eax
+; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
; CHECK: ret
define i32 @test1(float %x) {
%res = bitcast float %x to i32
@@ -9,7 +9,7 @@ define i32 @test1(float %x) {
}
; CHECK-LABEL: @test2
-; CHECK: vmovdz %edi
+; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test2(i32 %x) {
%res = insertelement <4 x i32>undef, i32 %x, i32 0
@@ -17,7 +17,7 @@ define <4 x i32> @test2(i32 %x) {
}
; CHECK-LABEL: @test3
-; CHECK: vmovqz %rdi
+; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <2 x i64> @test3(i64 %x) {
%res = insertelement <2 x i64>undef, i64 %x, i32 0
@@ -25,7 +25,7 @@ define <2 x i64> @test3(i64 %x) {
}
; CHECK-LABEL: @test4
-; CHECK: vmovdz (%rdi)
+; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test4(i32* %x) {
%y = load i32* %x
@@ -34,7 +34,7 @@ define <4 x i32> @test4(i32* %x) {
}
; CHECK-LABEL: @test5
-; CHECK: vmovssz %xmm0, (%rdi)
+; CHECK: vmovss %xmm0, (%rdi) ## encoding: [0x62
; CHECK: ret
define void @test5(float %x, float* %y) {
store float %x, float* %y, align 4
@@ -42,7 +42,7 @@ define void @test5(float %x, float* %y) {
}
; CHECK-LABEL: @test6
-; CHECK: vmovsdz %xmm0, (%rdi)
+; CHECK: vmovsd %xmm0, (%rdi) ## encoding: [0x62
; CHECK: ret
define void @test6(double %x, double* %y) {
store double %x, double* %y, align 8
@@ -50,7 +50,7 @@ define void @test6(double %x, double* %y) {
}
; CHECK-LABEL: @test7
-; CHECK: vmovssz (%rdi), %xmm0
+; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define float @test7(i32* %x) {
%y = load i32* %x
@@ -59,7 +59,7 @@ define float @test7(i32* %x) {
}
; CHECK-LABEL: @test8
-; CHECK: vmovdz %xmm0, %eax
+; CHECK: vmovd %xmm0, %eax ## encoding: [0x62
; CHECK: ret
define i32 @test8(<4 x i32> %x) {
%res = extractelement <4 x i32> %x, i32 0
@@ -67,7 +67,7 @@ define i32 @test8(<4 x i32> %x) {
}
; CHECK-LABEL: @test9
-; CHECK: vmovqz %xmm0, %rax
+; CHECK: vmovq %xmm0, %rax ## encoding: [0x62
; CHECK: ret
define i64 @test9(<2 x i64> %x) {
%res = extractelement <2 x i64> %x, i32 0
@@ -75,7 +75,7 @@ define i64 @test9(<2 x i64> %x) {
}
; CHECK-LABEL: @test10
-; CHECK: vmovdz (%rdi)
+; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test10(i32* %x) {
%y = load i32* %x, align 4
@@ -84,7 +84,7 @@ define <4 x i32> @test10(i32* %x) {
}
; CHECK-LABEL: @test11
-; CHECK: vmovssz (%rdi)
+; CHECK: vmovss (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x float> @test11(float* %x) {
%y = load float* %x, align 4
@@ -93,7 +93,7 @@ define <4 x float> @test11(float* %x) {
}
; CHECK-LABEL: @test12
-; CHECK: vmovsdz (%rdi)
+; CHECK: vmovsd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <2 x double> @test12(double* %x) {
%y = load double* %x, align 8
@@ -102,7 +102,7 @@ define <2 x double> @test12(double* %x) {
}
; CHECK-LABEL: @test13
-; CHECK: vmovqz %rdi
+; CHECK: vmovq %rdi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <2 x i64> @test13(i64 %x) {
%res = insertelement <2 x i64>zeroinitializer, i64 %x, i32 0
@@ -110,7 +110,7 @@ define <2 x i64> @test13(i64 %x) {
}
; CHECK-LABEL: @test14
-; CHECK: vmovdz %edi
+; CHECK: vmovd %edi, %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test14(i32 %x) {
%res = insertelement <4 x i32>zeroinitializer, i32 %x, i32 0
@@ -118,7 +118,7 @@ define <4 x i32> @test14(i32 %x) {
}
; CHECK-LABEL: @test15
-; CHECK: vmovdz (%rdi)
+; CHECK: vmovd (%rdi), %xmm0 ## encoding: [0x62
; CHECK: ret
define <4 x i32> @test15(i32* %x) {
%y = load i32* %x, align 4
diff --git a/test/CodeGen/X86/avx512-select.ll b/test/CodeGen/X86/avx512-select.ll
index d2d6681..83f4698 100644
--- a/test/CodeGen/X86/avx512-select.ll
+++ b/test/CodeGen/X86/avx512-select.ll
@@ -20,3 +20,22 @@ define <8 x i64> @select01(i32 %a, <8 x i64> %b) nounwind {
ret <8 x i64> %res
}
+; CHECK-LABEL: @select02
+; CHECK: cmpless %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovss %xmm2, {{.*}}%xmm1 {%k1}
+; CHECK: ret
+define float @select02(float %a, float %b, float %c, float %eps) {
+ %cmp = fcmp oge float %a, %eps
+ %cond = select i1 %cmp, float %c, float %b
+ ret float %cond
+}
+
+; CHECK-LABEL: @select03
+; CHECK: cmplesd %xmm0, %xmm3, %k1
+; CHECK-NEXT: vmovsd %xmm2, {{.*}}%xmm1 {%k1}
+; CHECK: ret
+define double @select03(double %a, double %b, double %c, double %eps) {
+ %cmp = fcmp oge double %a, %eps
+ %cond = select i1 %cmp, double %c, double %b
+ ret double %cond
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index c9e0c2b..59d7010 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
; CHECK: LCP
; CHECK: .long 2
; CHECK: .long 5
@@ -49,7 +49,7 @@ define <8 x double> @test4(<8 x double> %a) nounwind {
}
; CHECK-LABEL: test5:
-; CHECK: vpermi2pd
+; CHECK: vpermt2pd
; CHECK: ret
define <8 x double> @test5(<8 x double> %a, <8 x double> %b) nounwind {
%c = shufflevector <8 x double> %a, <8 x double> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
@@ -65,7 +65,7 @@ define <8 x i64> @test6(<8 x i64> %a) nounwind {
}
; CHECK-LABEL: test7:
-; CHECK: vpermi2q
+; CHECK: vpermt2q
; CHECK: ret
define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
%c = shufflevector <8 x i64> %a, <8 x i64> %b, <8 x i32> <i32 2, i32 8, i32 0, i32 1, i32 6, i32 10, i32 4, i32 5>
@@ -73,7 +73,7 @@ define <8 x i64> @test7(<8 x i64> %a, <8 x i64> %b) nounwind {
}
; CHECK-LABEL: test8:
-; CHECK: vpermi2d
+; CHECK: vpermt2d
; CHECK: ret
define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -81,7 +81,7 @@ define <16 x i32> @test8(<16 x i32> %a, <16 x i32> %b) nounwind {
}
; CHECK-LABEL: test9:
-; CHECK: vpermi2ps
+; CHECK: vpermt2ps
; CHECK: ret
define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
%c = shufflevector <16 x float> %a, <16 x float> %b, <16 x i32> <i32 15, i32 31, i32 14, i32 22, i32 13, i32 29, i32 4, i32 28, i32 11, i32 27, i32 10, i32 26, i32 9, i32 25, i32 8, i32 24>
@@ -89,7 +89,7 @@ define <16 x float> @test9(<16 x float> %a, <16 x float> %b) nounwind {
}
; CHECK-LABEL: test10:
-; CHECK: vpermi2ps (
+; CHECK: vpermt2ps (
; CHECK: ret
define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
%c = load <16 x float>* %b
@@ -98,7 +98,7 @@ define <16 x float> @test10(<16 x float> %a, <16 x float>* %b) nounwind {
}
; CHECK-LABEL: test11:
-; CHECK: vpermi2d (
+; CHECK: vpermt2d
; CHECK: ret
define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
%c = load <16 x i32>* %b
@@ -107,7 +107,7 @@ define <16 x i32> @test11(<16 x i32> %a, <16 x i32>* %b) nounwind {
}
; CHECK-LABEL: test12
-; CHECK: vmovlhpsz %xmm
+; CHECK: vmovlhps {{.*}}## encoding: [0x62
; CHECK: ret
define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) nounwind {
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
@@ -186,7 +186,7 @@ define <16 x float> @test21(<16 x float> %a, <16 x float> %c) {
}
; CHECK-LABEL: test22
-; CHECK: vmovhlpsz %xmm
+; CHECK: vmovhlps {{.*}}## encoding: [0x62
; CHECK: ret
define <4 x i32> @test22(<4 x i32> %a, <4 x i32> %b) nounwind {
%c = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 2, i32 3, i32 6, i32 7>
@@ -202,7 +202,7 @@ define <16 x float> @test23(<16 x float> %a, <16 x float> %c) {
}
; CHECK-LABEL: @test24
-; CHECK: vpermi2d
+; CHECK: vpermt2d
; CHECK: ret
define <16 x i32> @test24(<16 x i32> %a, <16 x i32> %b) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> %b, <16 x i32> <i32 0, i32 1, i32 2, i32 19, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
@@ -223,4 +223,11 @@ define <16 x i32> @test25(<16 x i32> %a, <16 x i32> %b) nounwind {
define <16 x i32> @test26(<16 x i32> %a) nounwind {
%c = shufflevector <16 x i32> %a, <16 x i32> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 undef, i32 9, i32 9, i32 undef, i32 11, i32 13, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %c
-}
\ No newline at end of file
+}
+
+; CHECK-LABEL: @test27
+; CHECK: ret
+define <16 x i32> @test27(<4 x i32>%a) {
+ %res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <16 x i32> %res
+}
diff --git a/test/CodeGen/X86/avx512-trunc-ext.ll b/test/CodeGen/X86/avx512-trunc-ext.ll
index 31db68c..5e097be 100644
--- a/test/CodeGen/X86/avx512-trunc-ext.ll
+++ b/test/CodeGen/X86/avx512-trunc-ext.ll
@@ -18,7 +18,7 @@ define <8 x i16> @trunc_8x64_to_8x16(<8 x i64> %i) nounwind readnone {
; CHECK-LABEL: zext_16x8_to_16x32
-; CHECK; vpmovzxbd {{.*}}%zmm
+; CHECK: vpmovzxbd {{.*}}%zmm
; CHECK: ret
define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
%x = zext <16 x i8> %i to <16 x i32>
@@ -26,7 +26,7 @@ define <16 x i32> @zext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
}
; CHECK-LABEL: sext_16x8_to_16x32
-; CHECK; vpmovsxbd {{.*}}%zmm
+; CHECK: vpmovsxbd {{.*}}%zmm
; CHECK: ret
define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
%x = sext <16 x i8> %i to <16 x i32>
@@ -35,7 +35,7 @@ define <16 x i32> @sext_16x8_to_16x32(<16 x i8> %i) nounwind readnone {
; CHECK-LABEL: zext_16x16_to_16x32
-; CHECK; vpmovzxwd {{.*}}%zmm
+; CHECK: vpmovzxwd {{.*}}%zmm
; CHECK: ret
define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %i) nounwind readnone {
%x = zext <16 x i16> %i to <16 x i32>
@@ -43,7 +43,7 @@ define <16 x i32> @zext_16x16_to_16x32(<16 x i16> %i) nounwind readnone {
}
; CHECK-LABEL: zext_8x16_to_8x64
-; CHECK; vpmovzxwq
+; CHECK: vpmovzxwq
; CHECK: ret
define <8 x i64> @zext_8x16_to_8x64(<8 x i16> %i) nounwind readnone {
%x = zext <8 x i16> %i to <8 x i64>
@@ -116,7 +116,7 @@ define i8 @trunc_8i16_to_8i1(<8 x i16> %a) {
ret i8 %mask
}
-; CHECK: sext_8i1_8i32
+; CHECK-LABEL: sext_8i1_8i32
; CHECK: vpbroadcastq LCP{{.*}}(%rip), %zmm0 {%k1} {z}
; CHECK: ret
define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
@@ -125,3 +125,24 @@ define <8 x i32> @sext_8i1_8i32(<8 x i32> %a1, <8 x i32> %a2) nounwind {
%y = sext <8 x i1> %x1 to <8 x i32>
ret <8 x i32> %y
}
+
+; CHECK-LABEL: trunc_v16i32_to_v16i16
+; CHECK: vpmovdw
+; CHECK: ret
+define <16 x i16> @trunc_v16i32_to_v16i16(<16 x i32> %x) {
+ %1 = trunc <16 x i32> %x to <16 x i16>
+ ret <16 x i16> %1
+}
+
+; CHECK-LABEL: trunc_i32_to_i1
+; CHECK: andl
+; CHECK: kmov
+; CHECK: kortest
+; CHECK: orl
+; CHECK: ret
+define i16 @trunc_i32_to_i1(i32 %a) {
+ %a_i = trunc i32 %a to i1
+ %maskv = insertelement <16 x i1> <i1 true, i1 false, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, i1 %a_i, i32 0
+ %res = bitcast <16 x i1> %maskv to i16
+ ret i16 %res
+}
diff --git a/test/CodeGen/X86/avx512-vbroadcast.ll b/test/CodeGen/X86/avx512-vbroadcast.ll
index 6f89d6c..9c6db11 100644
--- a/test/CodeGen/X86/avx512-vbroadcast.ll
+++ b/test/CodeGen/X86/avx512-vbroadcast.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl --show-mc-encoding | FileCheck %s
;CHECK-LABEL: _inreg16xi32:
;CHECK: vpbroadcastd {{.*}}, %zmm
@@ -19,7 +19,7 @@ define <8 x i64> @_inreg8xi64(i64 %a) {
}
;CHECK-LABEL: _inreg16xfloat:
-;CHECK: vbroadcastssz {{.*}}, %zmm
+;CHECK: vbroadcastss {{.*}}, %zmm
;CHECK: ret
define <16 x float> @_inreg16xfloat(float %a) {
%b = insertelement <16 x float> undef, float %a, i32 0
@@ -28,7 +28,7 @@ define <16 x float> @_inreg16xfloat(float %a) {
}
;CHECK-LABEL: _inreg8xdouble:
-;CHECK: vbroadcastsdz {{.*}}, %zmm
+;CHECK: vbroadcastsd {{.*}}, %zmm
;CHECK: ret
define <8 x double> @_inreg8xdouble(double %a) {
%b = insertelement <8 x double> undef, double %a, i32 0
@@ -45,9 +45,20 @@ define <16 x i32> @_xmm16xi32(<16 x i32> %a) {
}
;CHECK-LABEL: _xmm16xfloat
-;CHECK: vbroadcastssz
+;CHECK: vbroadcastss {{.*}}## encoding: [0x62
;CHECK: ret
define <16 x float> @_xmm16xfloat(<16 x float> %a) {
%b = shufflevector <16 x float> %a, <16 x float> undef, <16 x i32> zeroinitializer
ret <16 x float> %b
}
+
+define <16 x i32> @test_vbroadcast() {
+ ; CHECK: vpbroadcastd
+entry:
+ %0 = sext <16 x i1> zeroinitializer to <16 x i32>
+ %1 = fcmp uno <16 x float> undef, zeroinitializer
+ %2 = sext <16 x i1> %1 to <16 x i32>
+ %3 = select <16 x i1> %1, <16 x i32> %0, <16 x i32> %2
+ ret <16 x i32> %3
+}
+
diff --git a/test/CodeGen/X86/avx512-vec-cmp.ll b/test/CodeGen/X86/avx512-vec-cmp.ll
index 6ca5bcc..d762f00 100644
--- a/test/CodeGen/X86/avx512-vec-cmp.ll
+++ b/test/CodeGen/X86/avx512-vec-cmp.ll
@@ -5,9 +5,9 @@
; CHECK: vmovups
; CHECK: ret
define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
- %mask = fcmp ole <16 x float> %x, %y
- %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
- ret <16 x float> %max
+ %mask = fcmp ole <16 x float> %x, %y
+ %max = select <16 x i1> %mask, <16 x float> %x, <16 x float> %y
+ ret <16 x float> %max
}
; CHECK-LABEL: test2
@@ -15,9 +15,9 @@ define <16 x float> @test1(<16 x float> %x, <16 x float> %y) nounwind {
; CHECK: vmovupd
; CHECK: ret
define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
- %mask = fcmp ole <8 x double> %x, %y
- %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
- ret <8 x double> %max
+ %mask = fcmp ole <8 x double> %x, %y
+ %max = select <8 x i1> %mask, <8 x double> %x, <8 x double> %y
+ ret <8 x double> %max
}
; CHECK-LABEL: test3
@@ -26,9 +26,9 @@ define <8 x double> @test2(<8 x double> %x, <8 x double> %y) nounwind {
; CHECK: ret
define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwind {
%y = load <16 x i32>* %yp, align 4
- %mask = icmp eq <16 x i32> %x, %y
- %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
- ret <16 x i32> %max
+ %mask = icmp eq <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %x1
+ ret <16 x i32> %max
}
; CHECK-LABEL: @test4_unsigned
@@ -36,9 +36,9 @@ define <16 x i32> @test3(<16 x i32> %x, <16 x i32> %x1, <16 x i32>* %yp) nounwin
; CHECK: vmovdqu32
; CHECK: ret
define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
- %mask = icmp uge <16 x i32> %x, %y
- %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
- ret <16 x i32> %max
+ %mask = icmp uge <16 x i32> %x, %y
+ %max = select <16 x i1> %mask, <16 x i32> %x, <16 x i32> %y
+ ret <16 x i32> %max
}
; CHECK-LABEL: test5
@@ -46,9 +46,9 @@ define <16 x i32> @test4_unsigned(<16 x i32> %x, <16 x i32> %y) nounwind {
; CHECK: vmovdqu64 {{.*}}%k1
; CHECK: ret
define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
- %mask = icmp eq <8 x i64> %x, %y
- %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
- ret <8 x i64> %max
+ %mask = icmp eq <8 x i64> %x, %y
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+ ret <8 x i64> %max
}
; CHECK-LABEL: test6_unsigned
@@ -56,9 +56,9 @@ define <8 x i64> @test5(<8 x i64> %x, <8 x i64> %y) nounwind {
; CHECK: vmovdqu64 {{.*}}%k1
; CHECK: ret
define <8 x i64> @test6_unsigned(<8 x i64> %x, <8 x i64> %y) nounwind {
- %mask = icmp ugt <8 x i64> %x, %y
- %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
- ret <8 x i64> %max
+ %mask = icmp ugt <8 x i64> %x, %y
+ %max = select <8 x i1> %mask, <8 x i64> %x, <8 x i64> %y
+ ret <8 x i64> %max
}
; CHECK-LABEL: test7
@@ -111,3 +111,54 @@ define <8 x i32> @test11_unsigned(<8 x i32> %x, <8 x i32> %y) nounwind {
%max = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
ret <8 x i32> %max
}
+
+; CHECK-LABEL: test12
+; CHECK: vpcmpeqq %zmm2, %zmm0, [[LO:%k[0-7]]]
+; CHECK: vpcmpeqq %zmm3, %zmm1, [[HI:%k[0-7]]]
+; CHECK: kunpckbw [[LO]], [[HI]], {{%k[0-7]}}
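+; (The <16 x i64> comparison is legalized as two 512-bit compares; kunpckbw
+; then concatenates the two 8-bit mask results into one 16-bit mask.)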
+
+define i16 @test12(<16 x i64> %a, <16 x i64> %b) nounwind {
+ %res = icmp eq <16 x i64> %a, %b
+ %res1 = bitcast <16 x i1> %res to i16
+ ret i16 %res1
+}
+
+; CHECK-LABEL: test13
+; CHECK: vcmpeqps %zmm
+; CHECK: vpbroadcastd
+; CHECK: ret
+define <16 x i32> @test13(<16 x float>%a, <16 x float>%b)
+{
+ %cmpvector_i = fcmp oeq <16 x float> %a, %b
+ %conv = zext <16 x i1> %cmpvector_i to <16 x i32>
+ ret <16 x i32> %conv
+}
+
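+; The sub/icmp/sext/icmp-eq-zero chains below are each equivalent to a single
+; signed comparison, so only one vpcmp should survive, feeding the mask of a
+; zeroing masked move.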
+; CHECK-LABEL: test14
+; CHECK: vpcmp
+; CHECK-NOT: vpcmp
+; CHECK: vmovdqu32 {{.*}}{%k1} {z}
+; CHECK: ret
+define <16 x i32> @test14(<16 x i32>%a, <16 x i32>%b) {
+ %sub_r = sub <16 x i32> %a, %b
+ %cmp.i2.i = icmp sgt <16 x i32> %sub_r, %a
+ %sext.i3.i = sext <16 x i1> %cmp.i2.i to <16 x i32>
+ %mask = icmp eq <16 x i32> %sext.i3.i, zeroinitializer
+ %res = select <16 x i1> %mask, <16 x i32> zeroinitializer, <16 x i32> %sub_r
+ ret <16 x i32>%res
+}
+
+; CHECK-LABEL: test15
+; CHECK: vpcmpgtq
+; CHECK-NOT: vpcmp
+; CHECK: vmovdqu64 {{.*}}{%k1} {z}
+; CHECK: ret
+define <8 x i64> @test15(<8 x i64>%a, <8 x i64>%b) {
+ %sub_r = sub <8 x i64> %a, %b
+ %cmp.i2.i = icmp sgt <8 x i64> %sub_r, %a
+ %sext.i3.i = sext <8 x i1> %cmp.i2.i to <8 x i64>
+ %mask = icmp eq <8 x i64> %sext.i3.i, zeroinitializer
+ %res = select <8 x i1> %mask, <8 x i64> zeroinitializer, <8 x i64> %sub_r
+ ret <8 x i64>%res
+}
+
diff --git a/test/CodeGen/X86/avx512-vselect-crash.ll b/test/CodeGen/X86/avx512-vselect-crash.ll
new file mode 100644
index 0000000..9d652d3
--- /dev/null
+++ b/test/CodeGen/X86/avx512-vselect-crash.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+; CHECK-LABEL: test
+; CHECK: vpxord
+; CHECK: ret
+define <16 x i32> @test() {
+entry:
+ %0 = icmp slt <16 x i32> undef, undef
+ %1 = select <16 x i1> %0, <16 x i32> undef, <16 x i32> zeroinitializer
+ ret <16 x i32> %1
+}
diff --git a/test/CodeGen/X86/avx512-zext-load-crash.ll b/test/CodeGen/X86/avx512-zext-load-crash.ll
new file mode 100644
index 0000000..07ded13
--- /dev/null
+++ b/test/CodeGen/X86/avx512-zext-load-crash.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
+
+define <8 x i16> @test_zext_load() {
+ ; CHECK: vmovq
+entry:
+ %0 = load <2 x i16> ** undef, align 8
+ %1 = getelementptr inbounds <2 x i16>* %0, i64 1
+ %2 = load <2 x i16>* %0, align 1
+ %3 = shufflevector <2 x i16> %2, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %4 = load <2 x i16>* %1, align 1
+ %5 = shufflevector <2 x i16> %4, <2 x i16> undef, <8 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
+ %6 = shufflevector <8 x i16> %3, <8 x i16> %5, <8 x i32> <i32 0, i32 1, i32 8, i32 9, i32 undef, i32 undef, i32 undef, i32 undef>
+ ret <8 x i16> %6
+}
diff --git a/test/CodeGen/X86/barrier-sse.ll b/test/CodeGen/X86/barrier-sse.ll
index bbfeea6..80c0cc8 100644
--- a/test/CodeGen/X86/barrier-sse.ll
+++ b/test/CodeGen/X86/barrier-sse.ll
@@ -1,11 +1,14 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep sfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep lfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | not grep mfence
-; RUN: llc < %s -march=x86 -mattr=+sse2 | grep MEMBARRIER
+; RUN: llc < %s -mtriple=i686-apple-darwin -mattr=+sse2 | FileCheck %s
define void @test() {
fence acquire
+ ; CHECK: #MEMBARRIER
+
fence release
+ ; CHECK: #MEMBARRIER
+
fence acq_rel
+ ; CHECK: #MEMBARRIER
+
ret void
}
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 4f2060f..6b46596 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -1,13 +1,11 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
-; In this test we check that sign-extend of the mask bit is performed by
-; shifting the needed bit to the MSB, and not using shl+sra.
+; Verify that we produce movss instead of blendvps when possible.
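+; (A register-to-register movss copies only element 0 of its source, which
+; matches the <i1 true, i1 false, i1 false, i1 false> select masks used here.)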
;CHECK-LABEL: vsel_float:
-;CHECK: movl $-2147483648
-;CHECK-NEXT: movd
-;CHECK-NEXT: blendvps
+;CHECK-NOT: blendvps
+;CHECK: movss
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -15,9 +13,8 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
}
;CHECK-LABEL: vsel_4xi8:
-;CHECK: movl $-2147483648
-;CHECK-NEXT: movd
-;CHECK-NEXT: blendvps
+;CHECK-NOT: blendvps
+;CHECK: movss
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
@@ -26,12 +23,12 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not r
+; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
; reduce the mask in this case.
;CHECK-LABEL: vsel_8xi16:
-;CHECK: psllw
-;CHECK: psraw
-;CHECK: pblendvb
+;CHECK: andps
+;CHECK: andps
+;CHECK: orps
;CHECK: ret
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
diff --git a/test/CodeGen/X86/block-placement.ll b/test/CodeGen/X86/block-placement.ll
index d3e05d6..2681c10 100644
--- a/test/CodeGen/X86/block-placement.ll
+++ b/test/CodeGen/X86/block-placement.ll
@@ -701,7 +701,7 @@ exit:
define void @unanalyzable_branch_to_best_succ(i1 %cond) {
; Ensure that we can handle unanalyzable branches where the destination block
-; gets selected as the optimal sucessor to merge.
+; gets selected as the optimal successor to merge.
;
; CHECK: unanalyzable_branch_to_best_succ
; CHECK: %entry
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
new file mode 100644
index 0000000..6b77176
--- /dev/null
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mcpu=x86_64 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
+
+define <2 x i64> @foo(<2 x i64> %v) #0 {
+entry:
+ %r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
+ ret <2 x i64> %r
+}
+
+; CHECK-LABEL: @foo
+; CHECK: bswapq
+; CHECK: bswapq
+; CHECK: retq
+
+attributes #0 = { nounwind uwtable }
+
diff --git a/test/CodeGen/X86/bt.ll b/test/CodeGen/X86/bt.ll
index f12a354..036ec0a 100644
--- a/test/CodeGen/X86/bt.ll
+++ b/test/CodeGen/X86/bt.ll
@@ -20,7 +20,7 @@
define void @test2(i32 %x, i32 %n) nounwind {
entry:
; CHECK: test2
-; CHECK: btl %eax, %ecx
+; CHECK: btl %ecx, %eax
; CHECK: jb
%tmp29 = lshr i32 %x, %n ; <i32> [#uses=1]
%tmp3 = and i32 %tmp29, 1 ; <i32> [#uses=1]
diff --git a/test/CodeGen/X86/cache-intrinsic.ll b/test/CodeGen/X86/cache-intrinsic.ll
new file mode 100644
index 0000000..3091b5f
--- /dev/null
+++ b/test/CodeGen/X86/cache-intrinsic.ll
@@ -0,0 +1,26 @@
+; RUN: llc %s -o - | FileCheck %s
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@buffer = global [32 x i8] c"This is a largely unused buffer\00", align 16
+@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
+@.str1 = private unnamed_addr constant [25 x i8] c"Still, largely unused...\00", align 1
+
+define i32 @main() {
+entry:
+ %retval = alloca i32, align 4
+ store i32 0, i32* %retval
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ %call1 = call i8* @strcpy(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds ([25 x i8]* @.str1, i32 0, i32 0)) #3
+ call void @llvm.clear_cache(i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i8* getelementptr inbounds (i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0), i32 32)) #3
+ %call3 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* getelementptr inbounds ([32 x i8]* @buffer, i32 0, i32 0))
+ ret i32 0
+}
+
+; CHECK-NOT: __clear_cache
+
+declare i32 @printf(i8*, ...)
+
+declare i8* @strcpy(i8*, i8*)
+
+declare void @llvm.clear_cache(i8*, i8*)
diff --git a/test/CodeGen/X86/call-imm.ll b/test/CodeGen/X86/call-imm.ll
index 8753594..898b4ec 100644
--- a/test/CodeGen/X86/call-imm.ll
+++ b/test/CodeGen/X86/call-imm.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=static | FileCheck -check-prefix X86STA %s
; RUN: llc < %s -mtriple=i386-apple-darwin -relocation-model=pic | FileCheck -check-prefix X86PIC %s
; RUN: llc < %s -mtriple=i386-pc-linux -relocation-model=dynamic-no-pic | FileCheck -check-prefix X86DYN %s
+; RUN: llc < %s -mtriple=i386-pc-win32 -relocation-model=static | FileCheck -check-prefix X86WINSTA %s
; Call to immediate is not safe on x86-64 unless we *know* that the
; call will be within 32-bits pcrel from the dest immediate.
@@ -20,4 +21,5 @@ entry:
; X86STA: {{call.*12345678}}
; X86PIC-NOT: {{call.*12345678}}
; X86DYN: {{call.*12345678}}
+; X86WINSTA: {{call.*[*]%eax}}
; X64: {{call.*[*]%rax}}
diff --git a/test/CodeGen/X86/cas.ll b/test/CodeGen/X86/cas.ll
index c2dd05e..ec519c6 100644
--- a/test/CodeGen/X86/cas.ll
+++ b/test/CodeGen/X86/cas.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-pc-linux-gnu %s -o - | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux-gnu %s -o - -no-integrated-as | FileCheck %s
; C code this came from
;bool cas(float volatile *p, float *expected, float desired) {
diff --git a/test/CodeGen/X86/catch.ll b/test/CodeGen/X86/catch.ll
new file mode 100644
index 0000000..6f70213
--- /dev/null
+++ b/test/CodeGen/X86/catch.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -relocation-model=pic | FileCheck %s
+
+; PR18390
+; We used to assert when creating this label. The name itself is not critical. It
+; just needs to be a unique local symbol.
+; CHECK: .L.Lstr.DW.stub:
+; CHECK-NEXT: .quad .Lstr
+
+@str = private unnamed_addr constant [12 x i8] c"NSException\00"
+define void @f() {
+ invoke void @g()
+ to label %invoke.cont unwind label %lpad
+invoke.cont:
+ ret void
+lpad:
+ %tmp14 = landingpad { i8*, i32 } personality i8* bitcast (void ()* @h to i8*)
+ catch i8* getelementptr inbounds ([12 x i8]* @str, i64 0, i64 0)
+ ret void
+}
+declare void @g()
+declare void @h()
diff --git a/test/CodeGen/X86/cdecl-method-return.ll b/test/CodeGen/X86/cdecl-method-return.ll
new file mode 100644
index 0000000..2baa47a
--- /dev/null
+++ b/test/CodeGen/X86/cdecl-method-return.ll
@@ -0,0 +1,69 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 -mcpu=core2 | FileCheck %s
+
+; The sret flag causes the first two parameters to be reordered on the stack.
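+; Concretely, in @foo below the sret pointer %dst is found at 8(%esp) while
+; %src comes first at 4(%esp), the reverse of the declaration order.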
+
+define x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src) {
+ %v = load i32* %src
+ store i32 %v, i32* %dst
+ ret void
+}
+
+; CHECK-LABEL: _foo:
+; CHECK: movl 8(%esp), %[[dst:[^ ]*]]
+; CHECK: movl 4(%esp), %[[src:[^ ]*]]
+; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
+; CHECK: movl %[[v]], (%[[dst]])
+; CHECK: retl
+
+define i32 @bar() {
+ %src = alloca i32
+ %dst = alloca i32
+ store i32 42, i32* %src
+ call x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src)
+ %v = load i32* %dst
+ ret i32 %v
+}
+
+; CHECK-LABEL: _bar:
+; CHECK: movl $42, [[src:[^,]*]]
+; CHECK: leal [[src]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], 4(%esp)
+; CHECK: calll _foo
+; CHECK: movl [[dst]], %eax
+; CHECK: retl
+
+; If we don't have the sret flag, parameters are not reordered.
+
+define x86_cdeclmethodcc void @baz(i32* %dst, i32* %src) {
+ %v = load i32* %src
+ store i32 %v, i32* %dst
+ ret void
+}
+
+; CHECK-LABEL: _baz:
+; CHECK: movl 4(%esp), %[[dst:[^ ]*]]
+; CHECK: movl 8(%esp), %[[src:[^ ]*]]
+; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
+; CHECK: movl %[[v]], (%[[dst]])
+; CHECK: retl
+
+define i32 @qux() {
+ %src = alloca i32
+ %dst = alloca i32
+ store i32 42, i32* %src
+ call x86_cdeclmethodcc void @baz(i32* %dst, i32* %src)
+ %v = load i32* %dst
+ ret i32 %v
+}
+
+; CHECK-LABEL: _qux:
+; CHECK: movl $42, [[src:[^,]*]]
+; CHECK: leal [[src]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], 4(%esp)
+; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
+; CHECK: movl %[[reg]], (%esp)
+; CHECK: calll _baz
+; CHECK: movl [[dst]], %eax
+; CHECK: retl
diff --git a/test/CodeGen/X86/cfstring.ll b/test/CodeGen/X86/cfstring.ll
index 8cdd59e..cae4320 100644
--- a/test/CodeGen/X86/cfstring.ll
+++ b/test/CodeGen/X86/cfstring.ll
@@ -7,7 +7,7 @@
; Make sure that the string ends up the correct section.
; CHECK: .section __TEXT,__cstring
-; CHECK-NEXT: l_.str3:
+; CHECK-NEXT: L_.str3:
; CHECK: .section __DATA,__cfstring
; CHECK-NEXT: .align 4
@@ -15,13 +15,13 @@
; CHECK-NEXT: .quad ___CFConstantStringClassReference
; CHECK-NEXT: .long 1992
; CHECK-NEXT: .space 4
-; CHECK-NEXT: .quad l_.str3
+; CHECK-NEXT: .quad L_.str3
; CHECK-NEXT: .long 0
; CHECK-NEXT: .space 4
@isLogVisible = global i8 0, align 1
@__CFConstantStringClassReference = external global [0 x i32]
-@.str3 = linker_private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@.str3 = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
@_unnamed_cfstring_4 = private constant %struct.NSConstantString { i32* getelementptr inbounds ([0 x i32]* @__CFConstantStringClassReference, i32 0, i32 0), i32 1992, i8* getelementptr inbounds ([1 x i8]* @.str3, i32 0, i32 0), i32 0 }, section "__DATA,__cfstring"
@null.array = weak_odr constant [1 x i8] zeroinitializer, align 1
diff --git a/test/CodeGen/X86/cmov.ll b/test/CodeGen/X86/cmov.ll
index 215b862..d38d2b4 100644
--- a/test/CodeGen/X86/cmov.ll
+++ b/test/CodeGen/X86/cmov.ll
@@ -41,8 +41,8 @@ declare void @bar(i64) nounwind
define void @test3(i64 %a, i64 %b, i1 %p) nounwind {
; CHECK-LABEL: test3:
-; CHECK: cmovnel %edi, %esi
-; CHECK-NEXT: movl %esi, %edi
+; CHECK: cmov{{n?}}el %[[R1:e..]], %[[R2:e..]]
+; CHECK-NEXT: movl %[[R2]], %{{e..}}
%c = trunc i64 %a to i32
%d = trunc i64 %b to i32
diff --git a/test/CodeGen/X86/cmpxchg16b.ll b/test/CodeGen/X86/cmpxchg16b.ll
index edbd0bc..1d5bb85 100644
--- a/test/CodeGen/X86/cmpxchg16b.ll
+++ b/test/CodeGen/X86/cmpxchg16b.ll
@@ -6,7 +6,7 @@ entry:
; CHECK: movl $1, %ebx
; CHECK: lock
; CHECK-NEXT: cmpxchg16b
- %r = cmpxchg i128* %p, i128 0, i128 1 seq_cst
+ %r = cmpxchg i128* %p, i128 0, i128 1 seq_cst seq_cst
ret void
}
diff --git a/test/CodeGen/X86/coalescer-remat.ll b/test/CodeGen/X86/coalescer-remat.ll
index eb7b7a8..468b70b 100644
--- a/test/CodeGen/X86/coalescer-remat.ll
+++ b/test/CodeGen/X86/coalescer-remat.ll
@@ -5,7 +5,7 @@
define i32 @main() nounwind {
entry:
- %0 = cmpxchg i64* @val, i64 0, i64 1 monotonic
+ %0 = cmpxchg i64* @val, i64 0, i64 1 monotonic monotonic
%1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr ([7 x i8]* @"\01LC", i32 0, i64 0), i64 %0) nounwind
ret i32 0
}
diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
new file mode 100644
index 0000000..e3d6b34
--- /dev/null
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -0,0 +1,303 @@
+; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
+; This file tests the different cases that are involved when codegen prepare
+; tries to get sign extensions out of the way of addressing modes.
+; These tests require an actual target, as addressing mode decisions depend
+; on the target.
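+;
+; Roughly, the promotion rewrites a pattern such as
+;   %add = add nsw i32 %a, %b
+;   %idx = sext i32 %add to i64
+; into
+;   %a64 = sext i32 %a to i64
+;   %b64 = sext i32 %b to i64
+;   %idx = add nsw i64 %a64, %b64
+; so that the add can be folded into the addressing mode (%a and %b are
+; illustrative names, not taken from the tests below).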
+
+target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+
+; Check that we correctly promote both operands of the promotable add.
+; CHECK-LABEL: @twoArgsPromotion
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
+; CHECK: [[ARG2SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg2 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], [[ARG2SEXT]]
+; CHECK: inttoptr i64 [[PROMOTED]] to i8*
+; CHECK: ret
+define i8 @twoArgsPromotion(i32 %arg1, i32 %arg2) {
+ %add = add nsw i32 %arg1, %arg2
+ %sextadd = sext i32 %add to i64
+ %base = inttoptr i64 %sextadd to i8*
+ %res = load i8* %base
+ ret i8 %res
+}
+
+; Check that we do not promote both operands of the promotable add when
+; the instruction will not be folded into the addressing mode.
+; Otherwise, we will increase the number of instructions executed.
+; (This is a heuristic of course, because the new sext could have been
+; merged with something else.)
+; CHECK-LABEL: @twoArgsNoPromotion
+; CHECK: add nsw i32 %arg1, %arg2
+; CHECK: ret
+define i8 @twoArgsNoPromotion(i32 %arg1, i32 %arg2, i8* %base) {
+ %add = add nsw i32 %arg1, %arg2
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote when the related instruction does not have
+; the nsw flag.
+; CHECK-LABEL: @noPromotion
+; CHECK-NOT: add i64
+; CHECK: ret
+define i8 @noPromotion(i32 %arg1, i32 %arg2, i8* %base) {
+ %add = add i32 %arg1, %arg2
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we correctly promote constant arguments.
+; CHECK-LABEL: @oneArgPromotion
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i32 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotion(i32 %arg1, i8* %base) {
+ %add = add nsw i32 %arg1, 1
+ %sextadd = sext i32 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote truncate when we cannot determine the
+; bits that are dropped.
+; CHECK-LABEL: @oneArgPromotionBlockTrunc1
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 %arg1 to i8
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockTrunc1(i32 %arg1, i8* %base) {
+ %trunc = trunc i32 %arg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we do not promote truncate when we cannot determine all the
+; bits that are dropped.
+; CHECK-LABEL: @oneArgPromotionBlockTrunc2
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i16 %arg1 to i32
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[ARG1SEXT]] to i8
+; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionBlockTrunc2(i16 %arg1, i8* %base) {
+ %sextarg1 = sext i16 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Check that we are able to promote truncate when we know all the bits
+; that are dropped.
+; CHECK-LABEL: @oneArgPromotionPassTruncKeepSExt
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionPassTruncKeepSExt(i1 %arg1, i8* %base) {
+ %sextarg1 = sext i1 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; On X86, truncates are free. Check that we are able to promote the add
+; to be used as addressing mode and that we insert a truncate for the other
+; use.
+; CHECK-LABEL: @oneArgPromotionTruncInsert
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
+; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8* [[GEP]]
+; CHECK: add i8 [[LOAD]], [[TRUNC]]
+; CHECK: ret
+define i8 @oneArgPromotionTruncInsert(i8 %arg1, i8* %base) {
+ %add = add nsw i8 %arg1, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %finalres = add i8 %res, %add
+ ret i8 %finalres
+}
+
+; Cannot sext from a larger type than the promoted type.
+; CHECK-LABEL: @oneArgPromotionLargerType
+; CHECK: [[ARG1TRUNC:%[a-zA-Z_0-9-]+]] = trunc i128 %arg1 to i8
+; CHECK: [[ARG1SEXT64:%[a-zA-Z_0-9-]+]] = sext i8 [[ARG1TRUNC]] to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT64]], 1
+; CHECK: getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: ret
+define i8 @oneArgPromotionLargerType(i128 %arg1, i8* %base) {
+ %trunc = trunc i128 %arg1 to i8
+ %add = add nsw i8 %trunc, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %finalres = add i8 %res, %add
+ ret i8 %finalres
+}
+
+; Use same inserted trunc
+; On X86, truncates are free. Check that we are able to promote the add
+; to be used as addressing mode and that we insert a truncate for
+; *all* the other uses.
+; CHECK-LABEL: @oneArgPromotionTruncInsertSeveralUse
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i64 [[PROMOTED]] to i8
+; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8* [[GEP]]
+; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = add i8 [[LOAD]], [[TRUNC]]
+; CHECK: add i8 [[ADDRES]], [[TRUNC]]
+; CHECK: ret
+define i8 @oneArgPromotionTruncInsertSeveralUse(i8 %arg1, i8* %base) {
+ %add = add nsw i8 %arg1, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %almostfinalres = add i8 %res, %add
+ %finalres = add i8 %almostfinalres, %add
+ ret i8 %finalres
+}
+
+; Check that the promoted instruction is used for all uses of the original
+; sign extension.
+; CHECK-LABEL: @oneArgPromotionSExtSeveralUse
+; CHECK: [[ARG1SEXT:%[a-zA-Z_0-9-]+]] = sext i8 %arg1 to i64
+; CHECK: [[PROMOTED:%[a-zA-Z_0-9-]+]] = add nsw i64 [[ARG1SEXT]], 1
+; CHECK: [[GEP:%[a-zA-Z_0-9-]+]] = getelementptr inbounds i8* %base, i64 [[PROMOTED]]
+; CHECK: [[LOAD:%[a-zA-Z_0-9-]+]] = load i8* [[GEP]]
+; CHECK: [[ADDRES:%[a-zA-Z_0-9-]+]] = zext i8 [[LOAD]] to i64
+; CHECK: add i64 [[ADDRES]], [[PROMOTED]]
+; CHECK: ret
+define i64 @oneArgPromotionSExtSeveralUse(i8 %arg1, i8* %base) {
+ %add = add nsw i8 %arg1, 1
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ %almostfinalres = zext i8 %res to i64
+ %finalres = add i64 %almostfinalres, %sextadd
+ ret i64 %finalres
+}
+
+; Check all types of rollback mechanism.
+; For this test, the sign extension stays in place.
+; However, the matching process goes as far as what promoting both operands
+; of the first promotable add implies.
+; At this point the rollback mechanism kicks in and restores the state
+; until the addressing mode matcher is able to match something: in this
+; case, promote nothing.
+; Along the way, the promotion mechanism involves:
+; - Mutating the type of %promotableadd1 and %promotableadd2.
+; - Creating a sext for %arg1 and %arg2.
+; - Creating a trunc for a use of %promotableadd1.
+; - Replacing a bunch of uses.
+; - Setting the operands of the promoted instruction with the promoted values.
+; - Moving instruction around (mainly sext when promoting instruction).
+; Each of those promotion steps has to be undone at least once during this
+; specific test.
+; CHECK-LABEL: @twoArgsPromotionNest
+; CHECK: [[ORIG:%[a-zA-Z_0-9-]+]] = add nsw i32 %arg1, %arg2
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[ORIG]], [[ORIG]]
+; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; CHECK: getelementptr inbounds i8* %base, i64 [[SEXT]]
+; CHECK: ret
+define i8 @twoArgsPromotionNest(i32 %arg1, i32 %arg2, i8* %base) {
+ %promotableadd1 = add nsw i32 %arg1, %arg2
+ %promotableadd2 = add nsw i32 %promotableadd1, %promotableadd1
+ %sextadd = sext i32 %promotableadd2 to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Test the InstructionRemover undo, which was the only one not
+; exercised by the previous test.
+; The matcher first promotes the add, removes the trunc and promotes
+; the sext of arg1.
+; Then, the matcher cannot use an addressing mode r + r + r, thus it
+; rolls back.
+; CHECK-LABEL: @twoArgsNoPromotionRemove
+; CHECK: [[SEXTARG1:%[a-zA-Z_0-9-]+]] = sext i1 %arg1 to i32
+; CHECK: [[TRUNC:%[a-zA-Z_0-9-]+]] = trunc i32 [[SEXTARG1]] to i8
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i8 [[TRUNC]], %arg2
+; CHECK: [[SEXT:%[a-zA-Z_0-9-]+]] = sext i8 [[ADD]] to i64
+; CHECK: getelementptr inbounds i8* %base, i64 [[SEXT]]
+; CHECK: ret
+define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, i8* %base) {
+ %sextarg1 = sext i1 %arg1 to i32
+ %trunc = trunc i32 %sextarg1 to i8
+ %add = add nsw i8 %trunc, %arg2
+ %sextadd = sext i8 %add to i64
+ %arrayidx = getelementptr inbounds i8* %base, i64 %sextadd
+ %res = load i8* %arrayidx
+ ret i8 %res
+}
+
+; Ensure that when the profitability check kicks in, the IR is not modified
+; while IgnoreProfitability is on.
+; The profitability check happens when a candidate instruction has several uses.
+; The matcher will create a new matcher for each use and check if the
+; instruction is in the list of the matched instructions of this new matcher.
+; All changes made by the new matchers must be dropped before pursuing,
+; otherwise the state of the original matcher will be wrong.
+;
+; Without the profitability check, when checking for the second use of
+; arrayidx, the matcher promotes everything all the way to %arg1, %arg2.
+; Check that we did not promote anything in the final matching.
+;
+; <rdar://problem/16020230>
+; CHECK-LABEL: @checkProfitability
+; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
+; CHECK-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
+; CHECK: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
+; CHECK: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
+; CHECK: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; BB then
+; CHECK: [[BASE1:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTADD]], 48
+; CHECK: [[ADDR1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE1]] to i32*
+; CHECK: load i32* [[ADDR1]]
+; BB else
+; CHECK: [[BASE2:%[a-zA-Z_0-9-]+]] = add i64 [[SEXTADD]], 48
+; CHECK: [[ADDR2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE2]] to i32*
+; CHECK: load i32* [[ADDR2]]
+; CHECK: ret
+define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
+ %shl = shl nsw i32 %arg1, 1
+ %add1 = add nsw i32 %shl, %arg2
+ %sextidx1 = sext i32 %add1 to i64
+ %tmpptr = inttoptr i64 %sextidx1 to i32*
+ %arrayidx1 = getelementptr i32* %tmpptr, i64 12
+ br i1 %test, label %then, label %else
+then:
+ %res1 = load i32* %arrayidx1
+ br label %end
+else:
+ %res2 = load i32* %arrayidx1
+ br label %end
+end:
+ %tmp = phi i32 [%res1, %then], [%res2, %else]
+ %res = add i32 %tmp, %add1
+ %addr = inttoptr i32 %res to i32*
+ %final = load i32* %addr
+ ret i32 %final
+}
diff --git a/test/CodeGen/X86/codegen-prepare-cast.ll b/test/CodeGen/X86/codegen-prepare-cast.ll
index 2a8ead8..59c5133 100644
--- a/test/CodeGen/X86/codegen-prepare-cast.ll
+++ b/test/CodeGen/X86/codegen-prepare-cast.ll
@@ -1,5 +1,6 @@
; RUN: llc < %s -march=x86-64
; PR4297
+; RUN: opt -S < %s -codegenprepare | FileCheck %s
target datalayout =
"e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
@@ -8,6 +9,9 @@ target triple = "x86_64-unknown-linux-gnu"
%"char[][]" = type { i64, %"byte[]"* }
@.str = external constant [7 x i8] ; <[7 x i8]*> [#uses=1]
+; CHECK-LABEL: @_Dmain
+; CHECK: load i8* getelementptr inbounds ([7 x i8]* @.str, i32 0, i32 0)
+; CHECK: ret
define fastcc i32 @_Dmain(%"char[][]" %unnamed) {
entry:
%tmp = getelementptr [7 x i8]* @.str, i32 0, i32 0 ; <i8*> [#uses=1]
diff --git a/test/CodeGen/X86/codegen-prepare-extload.ll b/test/CodeGen/X86/codegen-prepare-extload.ll
index 14df815..9320706 100644
--- a/test/CodeGen/X86/codegen-prepare-extload.ll
+++ b/test/CodeGen/X86/codegen-prepare-extload.ll
@@ -5,7 +5,7 @@
; CodeGenPrepare should move the zext into the block with the load
; so that SelectionDAG can select it with the load.
-; CHECK: movzbl ({{%rdi|%rcx}}), %eax
+; CHECK: movsbl ({{%rdi|%rcx}}), %eax
define void @foo(i8* %p, i32* %q) {
entry:
diff --git a/test/CodeGen/X86/combine-or.ll b/test/CodeGen/X86/combine-or.ll
new file mode 100644
index 0000000..c1ce533
--- /dev/null
+++ b/test/CodeGen/X86/combine-or.ll
@@ -0,0 +1,269 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+
+; Verify that each of the following test cases is folded into a single
+; instruction which performs a blend operation.
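+; For example, in @test1 below each shuffle keeps one 64-bit lane of its
+; first operand and zeroes the other lane, so the OR reduces to a blend of
+; %a and %b (matched here as movsd).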
+
+define <2 x i64> @test1(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test3(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 1>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 1, i32 2, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test7(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 -1, i32 -1, i32 0, i32 0>
+ %and2 = and <4 x i32> %b, <i32 0, i32 0, i32 -1, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test8(<2 x i64> %a, <2 x i64> %b) {
+ %and1 = and <2 x i64> %a, <i64 -1, i64 0>
+ %and2 = and <2 x i64> %b, <i64 0, i64 -1>
+ %or = or <2 x i64> %and1, %and2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test9(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 0, i32 0, i32 -1, i32 -1>
+ %and2 = and <4 x i32> %b, <i32 -1, i32 -1, i32 0, i32 0>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK: ret
+
+
+define <2 x i64> @test10(<2 x i64> %a, <2 x i64> %b) {
+ %and1 = and <2 x i64> %a, <i64 0, i64 -1>
+ %and2 = and <2 x i64> %b, <i64 -1, i64 0>
+ %or = or <2 x i64> %and1, %and2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test11(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 -1, i32 0, i32 0, i32 0>
+ %and2 = and <4 x i32> %b, <i32 0, i32 -1, i32 -1, i32 -1>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <4 x i32> @test12(<4 x i32> %a, <4 x i32> %b) {
+ %and1 = and <4 x i32> %a, <i32 0, i32 -1, i32 -1, i32 -1>
+ %and2 = and <4 x i32> %b, <i32 -1, i32 0, i32 0, i32 0>
+ %or = or <4 x i32> %and1, %and2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NEXT: ret
+
+
+; Verify that the following test cases are folded into single shuffles.
+
+define <4 x i32> @test13(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 1, i32 1, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test14(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test15(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 4, i32 2, i32 1>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 2, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: xorps
+; CHECK: shufps
+; CHECK-NOT: shufps
+; CHECK-NOT: orps
+; CHECK: ret
+
+
+define <2 x i64> @test16(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: pslldq
+; CHECK-NOT: por
+; CHECK: punpcklqdq
+; CHECK: ret
+
+
+; Verify that the dag-combiner does not fold an OR of two shuffles into a single
+; shuffle instruction when the shuffle indexes are not compatible.
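+; (In @test17, for instance, lane 1 receives a nonzero element from both
+; shuffles, so the OR cannot be expressed as one two-input shuffle.)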
+
+define <4 x i32> @test17(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 2>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 1, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test17
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test18(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 4>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 4, i32 4>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test18
+; CHECK: orps
+; CHECK: ret
+
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> zeroinitializer, <4 x i32><i32 4, i32 0, i32 4, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> zeroinitializer, <4 x i32><i32 0, i32 4, i32 2, i32 2>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test19
+; CHECK: por
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test20(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 0, i32 2>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: xorps
+; CHECK: orps
+; CHECK-NEXT: movq
+; CHECK-NEXT: ret
+
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+ %shuf1 = shufflevector <2 x i64> %a, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %shuf2 = shufflevector <2 x i64> %b, <2 x i64> zeroinitializer, <2 x i32><i32 2, i32 0>
+ %or = or <2 x i64> %shuf1, %shuf2
+ ret <2 x i64> %or
+}
+; CHECK-LABEL: test21
+; CHECK: por
+; CHECK-NEXT: pslldq
+; CHECK-NEXT: ret
+
+
diff --git a/test/CodeGen/X86/combine-vec-shuffle.ll b/test/CodeGen/X86/combine-vec-shuffle.ll
new file mode 100644
index 0000000..9e6ab89
--- /dev/null
+++ b/test/CodeGen/X86/combine-vec-shuffle.ll
@@ -0,0 +1,253 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s
+
+; Verify that the DAGCombiner correctly folds according to the following rules:
+
+; fold (AND (shuf (A, C), shuf (B, C)) -> shuf (AND (A, B), C)
+; fold (OR (shuf (A, C), shuf (B, C)) -> shuf (OR (A, B), C)
+; fold (XOR (shuf (A, C), shuf (B, C)) -> shuf (XOR (A, B), V_0)
+
+; fold (AND (shuf (C, A), shuf (C, B)) -> shuf (C, AND (A, B))
+; fold (OR (shuf (C, A), shuf (C, B)) -> shuf (C, OR (A, B))
+; fold (XOR (shuf (C, A), shuf (C, B)) -> shuf (V_0, XOR (A, B))
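+;
+; For instance (an illustrative sketch, not one of the tests below):
+;   %s1 = shufflevector <4 x i32> %A, <4 x i32> %C, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+;   %s2 = shufflevector <4 x i32> %B, <4 x i32> %C, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+;   %and = and <4 x i32> %s1, %s2
+; becomes
+;   %t = and <4 x i32> %A, %B
+;   %and = shufflevector <4 x i32> %t, <4 x i32> %C, <4 x i32> <i32 0, i32 5, i32 2, i32 7>
+; because the two shuffles use the same mask and the same operand %C, so
+; each lane of the AND combines either %A with %B or %C with itself.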
+
+
+
+define <4 x i32> @test1(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: pshufd
+; CHECK: pand
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: pshufd
+; CHECK: por
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 1, i32 3>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: pshufd
+; CHECK: pxor
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: pshufd
+; CHECK: pand
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test5(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: pshufd
+; CHECK: por
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test6(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 4, i32 6, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: pshufd
+; CHECK: pxor
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+; Verify that the DAGCombiner moves the shuffle after the xor/and/or even when
+; the shuffles are not performing a swizzle operation (i.e. when the masks, such
+; as <0, 5, 2, 7> below, select lanes from both input vectors).
+
+define <4 x i32> @test1b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test1b
+; CHECK-NOT: blendps
+; CHECK: andps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2b
+; CHECK-NOT: blendps
+; CHECK: orps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3b
+; CHECK-NOT: blendps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: blendps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test4b
+; CHECK-NOT: blendps
+; CHECK: andps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+
+define <4 x i32> @test5b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5b
+; CHECK-NOT: blendps
+; CHECK: orps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+
+define <4 x i32> @test6b(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 5, i32 2, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6b
+; CHECK-NOT: blendps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: blendps
+; CHECK: ret
+
+define <4 x i32> @test1c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test1c
+; CHECK-NOT: shufps
+; CHECK: andps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test2c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test2c
+; CHECK-NOT: shufps
+; CHECK: orps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test3c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %a, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %b, <4 x i32> %c, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test3c
+; CHECK-NOT: shufps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: shufps
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %and = and <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %and
+}
+; CHECK-LABEL: test4c
+; CHECK-NOT: shufps
+; CHECK: andps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
+
+define <4 x i32> @test5c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %or = or <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %or
+}
+; CHECK-LABEL: test5c
+; CHECK-NOT: shufps
+; CHECK: orps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
+
+define <4 x i32> @test6c(<4 x i32> %a, <4 x i32> %b, <4 x i32> %c) {
+ %shuf1 = shufflevector <4 x i32> %c, <4 x i32> %a, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %shuf2 = shufflevector <4 x i32> %c, <4 x i32> %b, <4 x i32><i32 0, i32 2, i32 5, i32 7>
+ %xor = xor <4 x i32> %shuf1, %shuf2
+ ret <4 x i32> %xor
+}
+; CHECK-LABEL: test6c
+; CHECK-NOT: shufps
+; CHECK: xorps
+; CHECK-NEXT: xorps
+; CHECK-NEXT: shufps
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/const-base-addr.ll b/test/CodeGen/X86/const-base-addr.ll
new file mode 100644
index 0000000..f859d7f
--- /dev/null
+++ b/test/CodeGen/X86/const-base-addr.ll
@@ -0,0 +1,24 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%T = type { i32, i32, i32, i32 }
+
+define i32 @test1() nounwind {
+; CHECK-LABEL: test1
+; CHECK: movabsq $123456789012345678, %rcx
+; CHECK-NEXT: movl 4(%rcx), %eax
+; CHECK-NEXT: addl 8(%rcx), %eax
+; CHECK-NEXT: addl 12(%rcx), %eax
+ %addr1 = getelementptr %T* inttoptr (i64 123456789012345678 to %T*), i32 0, i32 1
+ %tmp1 = load i32* %addr1
+ %addr2 = getelementptr %T* inttoptr (i64 123456789012345678 to %T*), i32 0, i32 2
+ %tmp2 = load i32* %addr2
+ %addr3 = getelementptr %T* inttoptr (i64 123456789012345678 to %T*), i32 0, i32 3
+ %tmp3 = load i32* %addr3
+ %tmp4 = add i32 %tmp1, %tmp2
+ %tmp5 = add i32 %tmp3, %tmp4
+ ret i32 %tmp5
+}
+
diff --git a/test/CodeGen/X86/crash.ll b/test/CodeGen/X86/crash.ll
index 051150e..ee73377 100644
--- a/test/CodeGen/X86/crash.ll
+++ b/test/CodeGen/X86/crash.ll
@@ -1,7 +1,7 @@
; REQUIRES: asserts
-; RUN: llc -march=x86 < %s -verify-machineinstrs -precompute-phys-liveness
-; RUN: llc -march=x86-64 < %s -verify-machineinstrs -precompute-phys-liveness
-
+; RUN: llc -march=x86 -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
+; RUN: llc -march=x86-64 -no-integrated-as < %s -verify-machineinstrs -precompute-phys-liveness
+
; PR6497
; Chain and flag folding issues.
diff --git a/test/CodeGen/X86/cse-add-with-overflow.ll b/test/CodeGen/X86/cse-add-with-overflow.ll
new file mode 100644
index 0000000..1fcc03f
--- /dev/null
+++ b/test/CodeGen/X86/cse-add-with-overflow.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=generic | FileCheck %s
+; XFAIL: *
+; rdar://15661073: simple example of redundant adds
+;
+; MachineCSE should coalesce trivial subregister copies.
+;
+; The extra movl+addl should be removed during MachineCSE.
+; CHECK-LABEL: redundantadd
+; CHECK: cmpq
+; CHECK: movq
+; CHECK-NOT: movl
+; CHECK: addl
+; CHECK-NOT: addl
+; CHECK: ret
+
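+; The redundancy, in sketch form (an illustration, not checked output):
+; %tmp14 and %tmp15 are truncs of %tmp8 and %tmp12, so the 32-bit sum
+;   %tmp16 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %tmp14, i32 %tmp15)
+; equals the low 32 bits of the later
+;   %tmp18 = add i64 %tmp12, %tmp8
+; MachineCSE should therefore reuse the 32-bit result through a trivial
+; subregister copy instead of re-materializing it with a second movl+addl.
+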
+define i64 @redundantadd(i64* %a0, i64* %a1) {
+entry:
+ %tmp8 = load i64* %a0, align 8
+ %tmp12 = load i64* %a1, align 8
+ %tmp13 = icmp ult i64 %tmp12, -281474976710656
+ br i1 %tmp13, label %exit1, label %body
+
+exit1:
+ unreachable
+
+body:
+ %tmp14 = trunc i64 %tmp8 to i32
+ %tmp15 = trunc i64 %tmp12 to i32
+ %tmp16 = tail call { i32, i1 } @llvm.sadd.with.overflow.i32(i32 %tmp14, i32 %tmp15)
+ %tmp17 = extractvalue { i32, i1 } %tmp16, 1
+ br i1 %tmp17, label %exit2, label %return
+
+exit2:
+ unreachable
+
+return:
+ %tmp18 = add i64 %tmp12, %tmp8
+ %tmp19 = and i64 %tmp18, 4294967295
+ %tmp20 = or i64 %tmp19, -281474976710656
+ ret i64 %tmp20
+}
+
+declare { i32, i1 } @llvm.sadd.with.overflow.i32(i32, i32)
diff --git a/test/CodeGen/X86/ctpop-combine.ll b/test/CodeGen/X86/ctpop-combine.ll
index 786f7f9..463505b 100644
--- a/test/CodeGen/X86/ctpop-combine.ll
+++ b/test/CodeGen/X86/ctpop-combine.ll
@@ -35,6 +35,6 @@ define i32 @test3(i64 %x) nounwind readnone {
%conv = zext i1 %cmp to i32
ret i32 %conv
; CHECK-LABEL: test3:
-; CHECK: cmpb $2
+; CHECK: cmpl $2
; CHECK: ret
}
diff --git a/test/CodeGen/X86/darwin-no-dead-strip.ll b/test/CodeGen/X86/darwin-no-dead-strip.ll
index 452d1f8..35196aa 100644
--- a/test/CodeGen/X86/darwin-no-dead-strip.ll
+++ b/test/CodeGen/X86/darwin-no-dead-strip.ll
@@ -1,7 +1,13 @@
-; RUN: llc < %s | grep no_dead_strip
+; RUN: llc < %s | FileCheck %s
target datalayout = "e-p:32:32"
target triple = "i686-apple-darwin8.7.2"
-@x = weak global i32 0 ; <i32*> [#uses=1]
-@llvm.used = appending global [1 x i8*] [ i8* bitcast (i32* @x to i8*) ] ; <[1 x i8*]*> [#uses=0]
+@x = weak global i32 0
+; CHECK: .no_dead_strip _x
+
+@"\01Ly" = private global i8 0
+; CHECK: no_dead_strip Ly
+
+@llvm.used = appending global [2 x i8*] [ i8* bitcast (i32* @x to i8*),
+ i8* @"\01Ly" ]
diff --git a/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
new file mode 100644
index 0000000..23f8335
--- /dev/null
+++ b/test/CodeGen/X86/dbg-changes-codegen-branch-folding.ll
@@ -0,0 +1,109 @@
+; RUN: llc -march=x86-64 -mtriple=x86_64-linux < %s | FileCheck %s
+; RUN: opt -strip-debug < %s | llc -march=x86-64 -mtriple=x86_64-linux | FileCheck %s
+; http://llvm.org/PR19051. Minor code-motion difference with -g.
+; The presence of debug info shouldn't affect the codegen. Make sure that
+; we generate the same code sequence with and without debug info.
+;
+; CHECK: callq _Z3fooPcjPKc
+; CHECK: callq _Z3fooPcjPKc
+; CHECK: leaq (%rsp), %rdi
+; CHECK: movl $4, %esi
+; CHECK: testl {{%[a-z]+}}, {{%[a-z]+}}
+; CHECK: je .LBB0_4
+
+; Regenerate test with this command:
+; clang -emit-llvm -S -O2 -g
+; from this source:
+;
+; extern void foo(char *dst,unsigned siz,const char *src);
+; extern const char * i2str(int);
+;
+; struct AAA3 {
+; AAA3(const char *value) { foo(text,sizeof(text),value);}
+; void operator=(const char *value) { foo(text,sizeof(text),value);}
+; operator const char*() const { return text;}
+; char text[4];
+; };
+;
+; void bar (int param1,int param2) {
+; const char * temp(0);
+;
+; if (param2) {
+; temp = i2str(param2);
+; }
+; AAA3 var1("");
+; AAA3 var2("");
+;
+; if (param1)
+; var2 = "+";
+; else
+; var2 = "-";
+; var1 = "";
+; }
+
+%struct.AAA3 = type { [4 x i8] }
+
+@.str = private unnamed_addr constant [1 x i8] zeroinitializer, align 1
+@.str1 = private unnamed_addr constant [2 x i8] c"+\00", align 1
+@.str2 = private unnamed_addr constant [2 x i8] c"-\00", align 1
+
+; Function Attrs: uwtable
+define void @_Z3barii(i32 %param1, i32 %param2) #0 {
+entry:
+ %var1 = alloca %struct.AAA3, align 1
+ %var2 = alloca %struct.AAA3, align 1
+ %tobool = icmp eq i32 %param2, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ %call = call i8* @_Z5i2stri(i32 %param2)
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ call void @llvm.dbg.value(metadata !{%struct.AAA3* %var1}, i64 0, metadata !60)
+ call void @llvm.dbg.value(metadata !62, i64 0, metadata !63)
+ %arraydecay.i = getelementptr inbounds %struct.AAA3* %var1, i64 0, i32 0, i64 0
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
+ call void @llvm.dbg.declare(metadata !{%struct.AAA3* %var2}, metadata !38)
+ %arraydecay.i5 = getelementptr inbounds %struct.AAA3* %var2, i64 0, i32 0, i64 0
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
+ %tobool1 = icmp eq i32 %param1, 0
+ br i1 %tobool1, label %if.else, label %if.then2
+
+if.then2: ; preds = %if.end
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str1, i64 0, i64 0))
+ br label %if.end3
+
+if.else: ; preds = %if.end
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i5, i32 4, i8* getelementptr inbounds ([2 x i8]* @.str2, i64 0, i64 0))
+ br label %if.end3
+
+if.end3: ; preds = %if.else, %if.then2
+ call void @_Z3fooPcjPKc(i8* %arraydecay.i, i32 4, i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0))
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.declare(metadata, metadata) #1
+
+declare i8* @_Z5i2stri(i32) #2
+
+declare void @_Z3fooPcjPKc(i8*, i32, i8*) #2
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #1
+
+attributes #0 = { uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind readnone }
+attributes #2 = { "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+!llvm.module.flags = !{!48, !49}
+!llvm.ident = !{!50}
+
+!38 = metadata !{i32 786688, null, metadata !"var2", null, i32 20, null, i32 0, i32 0} ; [ DW_TAG_auto_variable ] [var2] [line 20]
+!48 = metadata !{i32 2, metadata !"Dwarf Version", i32 4}
+!49 = metadata !{i32 1, metadata !"Debug Info Version", i32 1}
+!50 = metadata !{metadata !"clang version 3.5 (202418)"}
+!60 = metadata !{i32 786689, null, metadata !"this", null, i32 16777216, null, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 0]
+!62 = metadata !{i8* getelementptr inbounds ([1 x i8]* @.str, i64 0, i64 0)}
+!63 = metadata !{i32 786689, null, metadata !"value", null, i32 33554439, null, i32 0, null} ; [ DW_TAG_arg_variable ] [value] [line 7]
diff --git a/test/CodeGen/X86/dbg-changes-codegen.ll b/test/CodeGen/X86/dbg-changes-codegen.ll
new file mode 100644
index 0000000..0b17c45
--- /dev/null
+++ b/test/CodeGen/X86/dbg-changes-codegen.ll
@@ -0,0 +1,83 @@
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux | FileCheck %s
+
+; The Peephole optimizer should fold the load into the cmp even with debug info.
+; CHECK-LABEL: _ZN3Foo3batEv
+; CHECK-NOT: movq pfoo
+; CHECK: cmpq {{%[a-z]+}}, pfoo(%rip)
+;
+; CHECK-LABEL: _Z3bazv
+; CHECK-NOT: movq wibble2
+; CHECK: cmpq {{%[a-z]+}}, wibble2(%rip)
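+;
+; In other words (a sketch; the actual register is matched generically above):
+; instead of
+;   movq pfoo(%rip), %rcx
+;   cmpq %rcx, %rdi
+; the peephole should emit the single memory-operand compare
+;   cmpq %rdi, pfoo(%rip)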
+
+; Regenerate test with this command:
+; clang -emit-llvm -S -O2 -g
+; from this source:
+; struct Foo {
+; bool bat();
+; bool operator==(Foo &arg) { return (this == &arg); }
+; };
+; Foo *pfoo;
+; bool Foo::bat() { return (*this == *pfoo); }
+;
+; struct Wibble {
+; int x;
+; } *wibble1, *wibble2;
+; struct Flibble {
+; void bar(Wibble *c) {
+; if (c < wibble2)
+; wibble2 = 0;
+; c->x = 0;
+; }
+; } flibble;
+; void baz() { flibble.bar(wibble1); }
+
+%struct.Foo = type { i8 }
+%struct.Wibble = type { i32 }
+%struct.Flibble = type { i8 }
+
+@pfoo = global %struct.Foo* null, align 8
+@wibble1 = global %struct.Wibble* null, align 8
+@wibble2 = global %struct.Wibble* null, align 8
+@flibble = global %struct.Flibble zeroinitializer, align 1
+
+; Function Attrs: nounwind readonly uwtable
+define zeroext i1 @_ZN3Foo3batEv(%struct.Foo* %this) #0 align 2 {
+entry:
+ %0 = load %struct.Foo** @pfoo, align 8
+ tail call void @llvm.dbg.value(metadata !{%struct.Foo* %0}, i64 0, metadata !62)
+ %cmp.i = icmp eq %struct.Foo* %0, %this
+ ret i1 %cmp.i
+}
+
+; Function Attrs: nounwind uwtable
+define void @_Z3bazv() #1 {
+entry:
+ %0 = load %struct.Wibble** @wibble1, align 8
+ tail call void @llvm.dbg.value(metadata !64, i64 0, metadata !65)
+ %1 = load %struct.Wibble** @wibble2, align 8
+ %cmp.i = icmp ugt %struct.Wibble* %1, %0
+ br i1 %cmp.i, label %if.then.i, label %_ZN7Flibble3barEP6Wibble.exit
+
+if.then.i: ; preds = %entry
+ store %struct.Wibble* null, %struct.Wibble** @wibble2, align 8
+ br label %_ZN7Flibble3barEP6Wibble.exit
+
+_ZN7Flibble3barEP6Wibble.exit: ; preds = %entry, %if.then.i
+ %x.i = getelementptr inbounds %struct.Wibble* %0, i64 0, i32 0
+ store i32 0, i32* %x.i, align 4
+ ret void
+}
+
+; Function Attrs: nounwind readnone
+declare void @llvm.dbg.value(metadata, i64, metadata) #2
+
+attributes #0 = { nounwind readonly uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #1 = { nounwind uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-frame-pointer-elim-non-leaf"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "unsafe-fp-math"="false" "use-soft-float"="false" }
+attributes #2 = { nounwind readnone }
+
+
+!17 = metadata !{i32 786448, null, null, null, i32 0, i64 0, i64 0, i64 0, i32 0, null} ; [ DW_TAG_reference_type ] [line 0, size 0, align 0, offset 0] [from Foo]
+!45 = metadata !{i32 786447, null, null, metadata !"", i32 0, i64 64, i64 64, i64 0, i32 0, null} ; [ DW_TAG_pointer_type ] [line 0, size 64, align 64, offset 0] [from Flibble]
+!62 = metadata !{i32 786689, null, metadata !"arg", null, i32 33554436, metadata !17, i32 0, null} ; [ DW_TAG_arg_variable ] [arg] [line 4]
+!64 = metadata !{%struct.Flibble* undef}
+!65 = metadata !{i32 786689, null, metadata !"this", null, i32 16777229, metadata !45, i32 1088, null} ; [ DW_TAG_arg_variable ] [this] [line 13]
diff --git a/test/CodeGen/X86/dll-linkage.ll b/test/CodeGen/X86/dll-linkage.ll
deleted file mode 100644
index a0c2a54..0000000
--- a/test/CodeGen/X86/dll-linkage.ll
+++ /dev/null
@@ -1,14 +0,0 @@
-; RUN: llc < %s -mtriple=i386-pc-mingw32 | FileCheck %s
-
-; RUN: llc < %s -mtriple=i386-pc-mingw32 -O0 | FileCheck %s -check-prefix=FAST
-; PR6275
-
-declare dllimport void @foo()
-
-define void @bar() nounwind {
-; CHECK: calll *__imp__foo
-; FAST: movl __imp__foo, [[R:%[a-z]{3}]]
-; FAST: calll *[[R]]
- call void @foo()
- ret void
-}
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
new file mode 100644
index 0000000..a38c2d8
--- /dev/null
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -0,0 +1,104 @@
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
+; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
+
+; CHECK: .text
+
+define void @notExported() {
+ ret void
+}
+
+; CHECK: .globl f1
+define dllexport void @f1() {
+ ret void
+}
+
+; CHECK: .globl f2
+define dllexport void @f2() unnamed_addr {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,lnk1
+; CHECK: .globl lnk1
+define linkonce_odr dllexport void @lnk1() {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,lnk2
+; CHECK: .globl lnk2
+define linkonce_odr dllexport void @lnk2() alwaysinline {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,weak1
+; CHECK: .globl weak1
+define weak_odr dllexport void @weak1() {
+ ret void
+}
+
+
+; CHECK: .data
+; CHECK: .globl Var1
+@Var1 = dllexport global i32 1, align 4
+
+; CHECK: .rdata,"r"
+; CHECK: .globl Var2
+@Var2 = dllexport unnamed_addr constant i32 1
+
+; CHECK: .comm Var3
+@Var3 = common dllexport global i32 0, align 4
+
+; CHECK: .section .data,"w",discard,WeakVar1
+; CHECK: .globl WeakVar1
+@WeakVar1 = weak_odr dllexport global i32 1, align 4
+
+; CHECK: .section .rdata,"r",discard,WeakVar2
+; CHECK: .globl WeakVar2
+@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
+
+
+; CHECK: .globl alias
+; CHECK: alias = notExported
+@alias = dllexport alias void()* @notExported
+
+; CHECK: .globl alias2
+; CHECK: alias2 = f1
+@alias2 = dllexport alias void()* @f1
+
+; CHECK: .globl alias3
+; CHECK: alias3 = notExported
+@alias3 = dllexport alias void()* @alias
+
+; CHECK: .weak weak_alias
+; CHECK: weak_alias = f1
+@weak_alias = dllexport alias weak_odr void()* @f1
+
+
+; CHECK: .section .drectve
+; WIN32: /EXPORT:Var1,DATA
+; WIN32: /EXPORT:Var2,DATA
+; WIN32: /EXPORT:Var3,DATA
+; WIN32: /EXPORT:WeakVar1,DATA
+; WIN32: /EXPORT:WeakVar2,DATA
+; WIN32: /EXPORT:f1
+; WIN32: /EXPORT:f2
+; WIN32: /EXPORT:lnk1
+; WIN32: /EXPORT:lnk2
+; WIN32: /EXPORT:weak1
+; WIN32: /EXPORT:alias
+; WIN32: /EXPORT:alias2
+; WIN32: /EXPORT:alias3
+; WIN32: /EXPORT:weak_alias
+; MINGW: -export:Var1,data
+; MINGW: -export:Var2,data
+; MINGW: -export:Var3,data
+; MINGW: -export:WeakVar1,data
+; MINGW: -export:WeakVar2,data
+; MINGW: -export:f1
+; MINGW: -export:f2
+; MINGW: -export:lnk1
+; MINGW: -export:lnk2
+; MINGW: -export:weak1
+; MINGW: -export:alias
+; MINGW: -export:alias2
+; MINGW: -export:alias3
+; MINGW: -export:weak_alias
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index bf57e78..1b34d23 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -1,12 +1,125 @@
-; RUN: llc < %s | FileCheck %s
-; PR2936
+; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
-target triple = "i386-pc-mingw32"
+; CHECK: .text
-define dllexport x86_fastcallcc i32 @foo() nounwind {
-entry:
+define void @notExported() {
+ ret void
+}
+
+; CHECK: .globl _f1
+define dllexport void @f1() {
+ ret void
+}
+
+; CHECK: .globl _f2
+define dllexport void @f2() unnamed_addr {
+ ret void
+}
+
+; CHECK: .globl _stdfun@0
+define dllexport x86_stdcallcc void @stdfun() nounwind {
+ ret void
+}
+
+; CHECK: .globl @fastfun@0
+define dllexport x86_fastcallcc i32 @fastfun() nounwind {
ret i32 0
}
+; CHECK: .globl _thisfun
+define dllexport x86_thiscallcc void @thisfun() nounwind {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,_lnk1
+; CHECK: .globl _lnk1
+define linkonce_odr dllexport void @lnk1() {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,_lnk2
+; CHECK: .globl _lnk2
+define linkonce_odr dllexport void @lnk2() alwaysinline {
+ ret void
+}
+
+; CHECK: .section .text,"xr",discard,_weak1
+; CHECK: .globl _weak1
+define weak_odr dllexport void @weak1() {
+ ret void
+}
+
+
+; CHECK: .data
+; CHECK: .globl _Var1
+@Var1 = dllexport global i32 1, align 4
+
+; CHECK: .rdata,"r"
+; CHECK: .globl _Var2
+@Var2 = dllexport unnamed_addr constant i32 1
+
+; CHECK: .comm _Var3
+@Var3 = common dllexport global i32 0, align 4
+
+; CHECK: .section .data,"w",discard,_WeakVar1
+; CHECK: .globl _WeakVar1
+@WeakVar1 = weak_odr dllexport global i32 1, align 4
+
+; CHECK: .section .rdata,"r",discard,_WeakVar2
+; CHECK: .globl _WeakVar2
+@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
+
+
+; CHECK: .globl _alias
+; CHECK: _alias = _notExported
+@alias = dllexport alias void()* @notExported
+
+; CHECK: .globl _alias2
+; CHECK: _alias2 = _f1
+@alias2 = dllexport alias void()* @f1
+
+; CHECK: .globl _alias3
+; CHECK: _alias3 = _notExported
+@alias3 = dllexport alias void()* @alias
+
+; CHECK: .weak _weak_alias
+; CHECK: _weak_alias = _f1
+@weak_alias = dllexport alias weak_odr void()* @f1
+
+
; CHECK: .section .drectve
-; CHECK: -export:@foo@0
+; WIN32: /EXPORT:_Var1,DATA
+; WIN32: /EXPORT:_Var2,DATA
+; WIN32: /EXPORT:_Var3,DATA
+; WIN32: /EXPORT:_WeakVar1,DATA
+; WIN32: /EXPORT:_WeakVar2,DATA
+; WIN32: /EXPORT:_f1
+; WIN32: /EXPORT:_f2
+; WIN32: /EXPORT:_stdfun@0
+; WIN32: /EXPORT:@fastfun@0
+; WIN32: /EXPORT:_thisfun
+; WIN32: /EXPORT:_lnk1
+; WIN32: /EXPORT:_lnk2
+; WIN32: /EXPORT:_weak1
+; WIN32: /EXPORT:_alias
+; WIN32: /EXPORT:_alias2
+; WIN32: /EXPORT:_alias3
+; WIN32: /EXPORT:_weak_alias
+; MINGW: -export:_Var1,data
+; MINGW: -export:_Var2,data
+; MINGW: -export:_Var3,data
+; MINGW: -export:_WeakVar1,data
+; MINGW: -export:_WeakVar2,data
+; MINGW: -export:_f1
+; MINGW: -export:_f2
+; MINGW: -export:_stdfun@0
+; MINGW: -export:@fastfun@0
+; MINGW: -export:_thisfun
+; MINGW: -export:_lnk1
+; MINGW: -export:_lnk2
+; MINGW: -export:_weak1
+; MINGW: -export:_alias
+; MINGW: -export:_alias2
+; MINGW: -export:_alias3
+; MINGW: -export:_weak_alias
diff --git a/test/CodeGen/X86/dllimport-x86_64.ll b/test/CodeGen/X86/dllimport-x86_64.ll
new file mode 100644
index 0000000..666409f
--- /dev/null
+++ b/test/CodeGen/X86/dllimport-x86_64.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple x86_64-pc-win32 < %s | FileCheck %s
+; RUN: llc -mtriple x86_64-pc-mingw32 < %s | FileCheck %s
+;
+; RUN: llc -mtriple x86_64-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
+; PR6275
+;
+; RUN: opt -mtriple x86_64-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+
+@Var1 = external dllimport global i32
+@Var2 = available_externally dllimport unnamed_addr constant i32 1
+
+declare dllimport void @fun()
+
+define available_externally dllimport void @inline1() {
+ ret void
+}
+
+define available_externally dllimport void @inline2() {
+ ret void
+}
+
+declare void @dummy(...)
+
+define void @use() nounwind {
+; CHECK: callq *__imp_fun(%rip)
+; FAST: movq __imp_fun(%rip), [[R:%[a-z]{3}]]
+; FAST-NEXT: callq *[[R]]
+ call void @fun()
+
+; CHECK: callq *__imp_inline1(%rip)
+; CHECK: callq *__imp_inline2(%rip)
+ call void @inline1()
+ call void @inline2()
+
+; available_externally uses go away
+; OPT-NOT: call void @inline1()
+; OPT-NOT: call void @inline2()
+; OPT-NOT: load i32* @Var2
+; OPT: call void (...)* @dummy(i32 %1, i32 1)
+
+; CHECK-DAG: movq __imp_Var1(%rip), [[R1:%[a-z]{3}]]
+; CHECK-DAG: movq __imp_Var2(%rip), [[R2:%[a-z]{3}]]
+ %1 = load i32* @Var1
+ %2 = load i32* @Var2
+ call void(...)* @dummy(i32 %1, i32 %2)
+
+ ret void
+}
diff --git a/test/CodeGen/X86/dllimport.ll b/test/CodeGen/X86/dllimport.ll
new file mode 100644
index 0000000..695bfce
--- /dev/null
+++ b/test/CodeGen/X86/dllimport.ll
@@ -0,0 +1,59 @@
+; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck %s
+;
+; RUN: llc -mtriple i386-pc-mingw32 -O0 < %s | FileCheck %s -check-prefix=FAST
+; PR6275
+;
+; RUN: opt -mtriple i386-pc-win32 -std-compile-opts -S < %s | FileCheck %s -check-prefix=OPT
+
+@Var1 = external dllimport global i32
+@Var2 = available_externally dllimport unnamed_addr constant i32 1
+
+declare dllimport void @fun()
+
+define available_externally dllimport void @inline1() {
+ ret void
+}
+
+define available_externally dllimport void @inline2() alwaysinline {
+ ret void
+}
+
+declare dllimport x86_stdcallcc void @stdfun() nounwind
+declare dllimport x86_fastcallcc void @fastfun() nounwind
+declare dllimport x86_thiscallcc void @thisfun() nounwind
+
+declare void @dummy(...)
+
+define void @use() nounwind {
+; CHECK: calll *__imp__fun
+; FAST: movl __imp__fun, [[R:%[a-z]{3}]]
+; FAST-NEXT: calll *[[R]]
+ call void @fun()
+
+; CHECK: calll *__imp__inline1
+; CHECK: calll *__imp__inline2
+ call void @inline1()
+ call void @inline2()
+
+; CHECK: calll *__imp__stdfun@0
+; CHECK: calll *__imp_@fastfun@0
+; CHECK: calll *__imp__thisfun
+ call void @stdfun()
+ call void @fastfun()
+ call void @thisfun()
+
+; available_externally uses go away
+; OPT-NOT: call void @inline1()
+; OPT-NOT: call void @inline2()
+; OPT-NOT: load i32* @Var2
+; OPT: call void (...)* @dummy(i32 %1, i32 1)
+
+; CHECK-DAG: movl __imp__Var1, [[R1:%[a-z]{3}]]
+; CHECK-DAG: movl __imp__Var2, [[R2:%[a-z]{3}]]
+ %1 = load i32* @Var1
+ %2 = load i32* @Var2
+ call void(...)* @dummy(i32 %1, i32 %2)
+
+ ret void
+}
diff --git a/test/CodeGen/X86/dwarf-comp-dir.ll b/test/CodeGen/X86/dwarf-comp-dir.ll
index 3b4a868..c8d7527 100644
--- a/test/CodeGen/X86/dwarf-comp-dir.ll
+++ b/test/CodeGen/X86/dwarf-comp-dir.ll
@@ -7,10 +7,12 @@ target triple = "x86_64-unknown-linux-gnu"
!llvm.dbg.cu = !{!0}
!llvm.module.flags = !{!5}
-!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !2, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
-!2 = metadata !{i32 0}
+!0 = metadata !{i32 720913, metadata !4, i32 12, metadata !"clang version 3.1 (trunk 143523)", i1 true, metadata !"", i32 0, metadata !2, metadata !7, metadata !2, metadata !2, null, metadata !""} ; [ DW_TAG_compile_unit ]
+!2 = metadata !{}
!3 = metadata !{i32 786473, metadata !4} ; [ DW_TAG_file_type ]
!4 = metadata !{metadata !"empty.c", metadata !"/home/nlewycky"}
+!6 = metadata !{i32 786451, metadata !4, null, metadata !"foo", i32 1, i64 8, i64 8, i32 0, i32 0, null, metadata !2, i32 0, null, null, metadata !"_ZTS3foo"} ; [ DW_TAG_structure_type ] [foo] [line 1, size 8, align 8, offset 0] [def] [from ]
+!7 = metadata !{metadata !6}
; The important part of the following check is that dir = #0.
; Dir Mod Time File Len File Name
diff --git a/test/CodeGen/X86/dynamic-alloca-in-entry.ll b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
new file mode 100644
index 0000000..7ed471c
--- /dev/null
+++ b/test/CodeGen/X86/dynamic-alloca-in-entry.ll
@@ -0,0 +1,19 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+; Allocas with unknown size in the entry block are dynamic.
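+; (A dynamic alloca has to grow the stack at run time, so on Win32 the backend
+; is expected to probe the newly allocated pages through __chkstk, which is
+; what the CHECK lines below verify.)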
+define void @foo(i32 %n) {
+ %m = alloca i32, i32 %n
+ ret void
+}
+; CHECK-LABEL: _foo:
+; CHECK: calll __chkstk
+; CHECK: retl
+
+; Use of inalloca implies that the alloca is not static.
+define void @bar() {
+ %m = alloca inalloca i32
+ ret void
+}
+; CHECK-LABEL: _bar:
+; CHECK: calll __chkstk
+; CHECK: retl
diff --git a/test/CodeGen/X86/exedepsfix-broadcast.ll b/test/CodeGen/X86/exedepsfix-broadcast.ll
new file mode 100644
index 0000000..a18f751
--- /dev/null
+++ b/test/CodeGen/X86/exedepsfix-broadcast.ll
@@ -0,0 +1,128 @@
+; RUN: llc -O3 -mtriple=x86_64-apple-macosx -o - < %s -mattr=+avx2 -enable-unsafe-fp-math -mcpu=core2 | FileCheck %s
+; Check that the ExeDepsFix pass correctly fixes the domain for broadcast instructions.
+; <rdar://problem/16354675>
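+;
+; A sketch of the expectation (the CHECK lines are what is actually verified):
+; the broadcast constant below feeds only floating-point ops (vandps/vmaxps),
+; so ExeDepsFix should select the FP-domain form vbroadcastss rather than the
+; integer-domain form vpbroadcastd, avoiding a domain-crossing penalty.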
+
+; CHECK-LABEL: ExeDepsFix_broadcastss
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <4 x float> @ExeDepsFix_broadcastss(<4 x float> %arg, <4 x float> %arg2) {
+ %bitcast = bitcast <4 x float> %arg to <4 x i32>
+ %and = and <4 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %floatcast = bitcast <4 x i32> %and to <4 x float>
+ %max_is_x = fcmp oge <4 x float> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x float> %floatcast, <4 x float> %arg2
+ ret <4 x float> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastss256
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <8 x float> @ExeDepsFix_broadcastss256(<8 x float> %arg, <8 x float> %arg2) {
+ %bitcast = bitcast <8 x float> %arg to <8 x i32>
+ %and = and <8 x i32> %bitcast, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %floatcast = bitcast <8 x i32> %and to <8 x float>
+ %max_is_x = fcmp oge <8 x float> %floatcast, %arg2
+ %max = select <8 x i1> %max_is_x, <8 x float> %floatcast, <8 x float> %arg2
+ ret <8 x float> %max
+}
+
+
+; CHECK-LABEL: ExeDepsFix_broadcastss_inreg
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <4 x float> @ExeDepsFix_broadcastss_inreg(<4 x float> %arg, <4 x float> %arg2, i32 %broadcastvalue) {
+ %bitcast = bitcast <4 x float> %arg to <4 x i32>
+ %in = insertelement <4 x i32> undef, i32 %broadcastvalue, i32 0
+ %mask = shufflevector <4 x i32> %in, <4 x i32> undef, <4 x i32> zeroinitializer
+ %and = and <4 x i32> %bitcast, %mask
+ %floatcast = bitcast <4 x i32> %and to <4 x float>
+ %max_is_x = fcmp oge <4 x float> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x float> %floatcast, <4 x float> %arg2
+ ret <4 x float> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastss256_inreg
+; CHECK: broadcastss
+; CHECK: vandps
+; CHECK: vmaxps
+; CHECK: ret
+define <8 x float> @ExeDepsFix_broadcastss256_inreg(<8 x float> %arg, <8 x float> %arg2, i32 %broadcastvalue) {
+ %bitcast = bitcast <8 x float> %arg to <8 x i32>
+ %in = insertelement <8 x i32> undef, i32 %broadcastvalue, i32 0
+ %mask = shufflevector <8 x i32> %in, <8 x i32> undef, <8 x i32> zeroinitializer
+ %and = and <8 x i32> %bitcast, %mask
+ %floatcast = bitcast <8 x i32> %and to <8 x float>
+ %max_is_x = fcmp oge <8 x float> %floatcast, %arg2
+ %max = select <8 x i1> %max_is_x, <8 x float> %floatcast, <8 x float> %arg2
+ ret <8 x float> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd
+; In this case the broadcast is folded directly into vandpd.
+; CHECK: vandpd
+; CHECK: vmaxpd
+; CHECK: ret
+define <2 x double> @ExeDepsFix_broadcastsd(<2 x double> %arg, <2 x double> %arg2) {
+ %bitcast = bitcast <2 x double> %arg to <2 x i64>
+ %and = and <2 x i64> %bitcast, <i64 2147483647, i64 2147483647>
+ %floatcast = bitcast <2 x i64> %and to <2 x double>
+ %max_is_x = fcmp oge <2 x double> %floatcast, %arg2
+ %max = select <2 x i1> %max_is_x, <2 x double> %floatcast, <2 x double> %arg2
+ ret <2 x double> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd256
+; CHECK: broadcastsd
+; CHECK: vandpd
+; CHECK: vmaxpd
+; CHECK: ret
+define <4 x double> @ExeDepsFix_broadcastsd256(<4 x double> %arg, <4 x double> %arg2) {
+ %bitcast = bitcast <4 x double> %arg to <4 x i64>
+ %and = and <4 x i64> %bitcast, <i64 2147483647, i64 2147483647, i64 2147483647, i64 2147483647>
+ %floatcast = bitcast <4 x i64> %and to <4 x double>
+ %max_is_x = fcmp oge <4 x double> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
+ ret <4 x double> %max
+}
+
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd_inreg
+; ExeDepsFix works top-down, so it coalesces the vmovlhps domain with vandps,
+; and after that there is nothing more it can do to match vmaxpd.
+; CHECK: vmovlhps
+; CHECK: vandps
+; CHECK: vmaxpd
+; CHECK: ret
+define <2 x double> @ExeDepsFix_broadcastsd_inreg(<2 x double> %arg, <2 x double> %arg2, i64 %broadcastvalue) {
+ %bitcast = bitcast <2 x double> %arg to <2 x i64>
+ %in = insertelement <2 x i64> undef, i64 %broadcastvalue, i32 0
+ %mask = shufflevector <2 x i64> %in, <2 x i64> undef, <2 x i32> zeroinitializer
+ %and = and <2 x i64> %bitcast, %mask
+ %floatcast = bitcast <2 x i64> %and to <2 x double>
+ %max_is_x = fcmp oge <2 x double> %floatcast, %arg2
+ %max = select <2 x i1> %max_is_x, <2 x double> %floatcast, <2 x double> %arg2
+ ret <2 x double> %max
+}
+
+; CHECK-LABEL: ExeDepsFix_broadcastsd256_inreg
+; CHECK: broadcastsd
+; CHECK: vandpd
+; CHECK: vmaxpd
+; CHECK: ret
+define <4 x double> @ExeDepsFix_broadcastsd256_inreg(<4 x double> %arg, <4 x double> %arg2, i64 %broadcastvalue) {
+ %bitcast = bitcast <4 x double> %arg to <4 x i64>
+ %in = insertelement <4 x i64> undef, i64 %broadcastvalue, i32 0
+ %mask = shufflevector <4 x i64> %in, <4 x i64> undef, <4 x i32> zeroinitializer
+ %and = and <4 x i64> %bitcast, %mask
+ %floatcast = bitcast <4 x i64> %and to <4 x double>
+ %max_is_x = fcmp oge <4 x double> %floatcast, %arg2
+ %max = select <4 x i1> %max_is_x, <4 x double> %floatcast, <4 x double> %arg2
+ ret <4 x double> %max
+}
+
diff --git a/test/CodeGen/X86/extract-store.ll b/test/CodeGen/X86/extract-store.ll
new file mode 100644
index 0000000..27d9380
--- /dev/null
+++ b/test/CodeGen/X86/extract-store.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse4.1 | FileCheck %s -check-prefix=SSE41
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+avx | FileCheck %s -check-prefix=AVX
+
+define void @pextrb(i8* nocapture %dst, <16 x i8> %foo) {
+; AVX: vpextrb
+; SSE41: pextrb
+; AVX-NOT: movb
+; SSE41-NOT: movb
+ %vecext = extractelement <16 x i8> %foo, i32 15
+ store i8 %vecext, i8* %dst, align 1
+ ret void
+}
+
+define void @pextrw(i16* nocapture %dst, <8 x i16> %foo) {
+; AVX: vpextrw
+; SSE41: pextrw
+; AVX-NOT: movw
+; SSE41-NOT: movw
+  %vecext = extractelement <8 x i16> %foo, i32 7
+ store i16 %vecext, i16* %dst, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/fast-isel-args-fail.ll b/test/CodeGen/X86/fast-isel-args-fail.ll
index e748e1c..7467edd 100644
--- a/test/CodeGen/X86/fast-isel-args-fail.ll
+++ b/test/CodeGen/X86/fast-isel-args-fail.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-apple-darwin10
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win32 | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -fast-isel -verify-machineinstrs -mtriple=x86_64-pc-win64 | FileCheck %s -check-prefix=WIN64
-; Requires: Asserts
+; REQUIRES: asserts
; Previously, this would cause an assert.
define i31 @t1(i31 %a, i31 %b, i31 %c) {
diff --git a/test/CodeGen/X86/fast-isel-select.ll b/test/CodeGen/X86/fast-isel-select.ll
new file mode 100644
index 0000000..53158bc
--- /dev/null
+++ b/test/CodeGen/X86/fast-isel-select.ll
@@ -0,0 +1,16 @@
+; RUN: llc -mtriple x86_64-apple-darwin -O0 -o - < %s | FileCheck %s
+; Make sure we use only the least significant bit of the value that feeds the
+; select. Otherwise, we may treat the value as non-zero even though its
+; LSB is zero.
+; <rdar://problem/15651765>
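+;
+; Concretely (a sketch of the intent, not checked output): an i1 occupies only
+; the low bit of its byte register and the remaining bits are unspecified, so
+; the select condition must be tested with `testb $1, %reg` rather than by
+; comparing the whole register against zero.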
+
+; CHECK-LABEL: fastisel_select:
+; CHECK: subb {{%[a-z0-9]+}}, [[RES:%[a-z0-9]+]]
+; CHECK: testb $1, [[RES]]
+; CHECK: cmovel
+define i32 @fastisel_select(i1 %exchSub2211_, i1 %trunc_8766) {
+ %shuffleInternal15257_8932 = sub i1 %exchSub2211_, %trunc_8766
+ %counter_diff1345 = select i1 %shuffleInternal15257_8932, i32 1204476887, i32 0
+ ret i32 %counter_diff1345
+}
+
diff --git a/test/CodeGen/X86/fast-isel-x86.ll b/test/CodeGen/X86/fast-isel-x86.ll
index ba86e88..a212a7c 100644
--- a/test/CodeGen/X86/fast-isel-x86.ll
+++ b/test/CodeGen/X86/fast-isel-x86.ll
@@ -3,7 +3,7 @@
; This should use flds to set the return value.
; CHECK-LABEL: test0:
; CHECK: flds
-; CHECK: ret
+; CHECK: retl
@G = external global float
define float @test0() nounwind {
%t = load float* @G
@@ -12,7 +12,7 @@ define float @test0() nounwind {
; This should pop 4 bytes on return.
; CHECK-LABEL: test1:
-; CHECK: ret $4
+; CHECK: retl $4
define void @test1({i32, i32, i32, i32}* sret %p) nounwind {
store {i32, i32, i32, i32} zeroinitializer, {i32, i32, i32, i32}* %p
ret void
@@ -25,7 +25,7 @@ define void @test1({i32, i32, i32, i32}* sret %p) nounwind {
; CHECK-NEXT: L2$pb:
; CHECK-NEXT: pop
; CHECK: HHH
-; CHECK: ret
+; CHECK: retl
@HHH = external global i32
define i32 @test2() nounwind {
%t = load i32* @HHH
diff --git a/test/CodeGen/X86/fast-isel.ll b/test/CodeGen/X86/fast-isel.ll
index 132df2b..bc79184 100644
--- a/test/CodeGen/X86/fast-isel.ll
+++ b/test/CodeGen/X86/fast-isel.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -march=x86 -mattr=sse2
-; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=x86_64-apple-darwin10
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -march=x86 -mattr=sse2 -no-integrated-as
+; RUN: llc < %s -fast-isel -fast-isel-abort -verify-machineinstrs -mtriple=x86_64-apple-darwin10 -no-integrated-as
; This tests very minimal fast-isel functionality.
diff --git a/test/CodeGen/X86/fastcall-correct-mangling.ll b/test/CodeGen/X86/fastcall-correct-mangling.ll
index 3569d36..00dc44e 100644
--- a/test/CodeGen/X86/fastcall-correct-mangling.ll
+++ b/test/CodeGen/X86/fastcall-correct-mangling.ll
@@ -1,14 +1,33 @@
-; RUN: llc < %s -mtriple=i386-unknown-mingw32 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-unknown-mingw32 | \
+; RUN: FileCheck --check-prefix=CHECK32 %s
+
+; RUN: llc < %s -mtriple=i386-unknown-win32 | \
+; RUN: FileCheck --check-prefix=CHECK32 %s
+
+; RUN: llc < %s -mtriple=x86_64-unknown-mingw32 | \
+; RUN: FileCheck --check-prefix=CHECK64 %s
+
+; RUN: llc < %s -mtriple=x86_64-unknown-win32 | \
+; RUN: FileCheck --check-prefix=CHECK64 %s
; Check that a fastcall function gets correct mangling
define x86_fastcallcc void @func(i64 %X, i8 %Y, i8 %G, i16 %Z) {
-; CHECK: @func@20:
+; CHECK32-LABEL: {{^}}@func@20:
+; CHECK64-LABEL: {{^}}func:
ret void
}
define x86_fastcallcc i32 @"\01DoNotMangle"(i32 %a) {
-; CHECK: DoNotMangle:
+; CHECK32-LABEL: {{^}}DoNotMangle:
+; CHECK64-LABEL: {{^}}DoNotMangle:
entry:
ret i32 %a
}
+
+define private x86_fastcallcc void @dontCrash() {
+; The name is fairly arbitrary since it is private. Just don't crash.
+; CHECK32-LABEL: {{^}}L@dontCrash@0:
+; CHECK64-LABEL: {{^}}.LdontCrash:
+ ret void
+}
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index 917eac0..47252ec 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -42,6 +42,39 @@ entry:
ret float %call
}
+; Test FMA3 variant selection
+; CHECK: fma3_select231ssX:
+; CHECK: vfmadd231ss xmm
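+; (The 231 form computes dst = src2 * src3 + dst, so a loop that accumulates
+; into the phi value, as below, should select vfmadd231ss rather than the 213
+; form; this note is an explanatory sketch, the CHECK line is what is tested.)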
+define float @fma3_select231ssX(float %x, float %y) #0 {
+entry:
+ br label %while.body
+while.body: ; preds = %while.body, %while.body
+ %acc.01 = phi float [ 0.000000e+00, %entry ], [ %acc, %while.body ]
+ %acc = tail call float @llvm.fma.f32(float %x, float %y, float %acc.01) nounwind readnone
+ %b = fcmp ueq float %acc, 0.0
+ br i1 %b, label %while.body, label %while.end
+while.end: ; preds = %while.body, %entry
+ ret float %acc
+}
+
+; Test FMA3 variant selection
+; CHECK: fma3_select231pdY:
+; CHECK: vfmadd231pd ymm
+define <4 x double> @fma3_select231pdY(<4 x double> %x, <4 x double> %y) #0 {
+entry:
+ br label %while.body
+while.body: ; preds = %entry, %while.body
+ %acc.04 = phi <4 x double> [ zeroinitializer, %entry ], [ %add, %while.body ]
+ %add = tail call <4 x double> @llvm.fma.v4f64(<4 x double> %x, <4 x double> %y, <4 x double> %acc.04)
+ %vecext = extractelement <4 x double> %add, i32 0
+ %cmp = fcmp oeq double %vecext, 0.000000e+00
+ br i1 %cmp, label %while.body, label %while.end
+
+while.end: ; preds = %while.body
+ ret <4 x double> %add
+}
+
declare float @llvm.fma.f32(float, float, float) nounwind readnone
declare double @llvm.fma.f64(double, double, double) nounwind readnone
declare x86_fp80 @llvm.fma.f80(x86_fp80, x86_fp80, x86_fp80) nounwind readnone
+declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) nounwind readnone
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index e3910a6..9a25096 100644
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -3,7 +3,7 @@
; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fmadd213ss %xmm
+ ; CHECK: fmadd213ss (%r8), %xmm
%res = call <4 x float> @llvm.x86.fma.vfmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
@@ -24,7 +24,7 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>) nounwind readnone
define <4 x float> @test_x86_fnmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
- ; CHECK: fnmadd213ss %xmm
+ ; CHECK: fnmadd213ss (%r8), %xmm
%res = call <4 x float> @llvm.x86.fma.vfnmadd.ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) nounwind
ret <4 x float> %res
}
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index 7a1a9ae..494cb28 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mcpu=corei7-avx -mattr=+fma4 | FileCheck %s
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
diff --git a/test/CodeGen/X86/fold-call-oper.ll b/test/CodeGen/X86/fold-call-oper.ll
new file mode 100644
index 0000000..94e2a6f
--- /dev/null
+++ b/test/CodeGen/X86/fold-call-oper.ll
@@ -0,0 +1,48 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+;
+; PR18396: Assertion: MO->isDead "Cannot fold physreg def".
+; InlineSpiller::foldMemoryOperand needs to handle undef call operands.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+
+@a = external global i32**, align 8
+@b = external global i32, align 4
+
+; Check that the call targets are folded, and we don't crash!
+; CHECK-LABEL: foldCallOper:
+; CHECK: callq *{{.*}}(%rbp)
+; CHECK: callq *{{.*}}(%rbp)
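+; That is (a sketch, offsets illustrative): the spilled function pointer %p1
+; should be reloaded by the call itself, e.g. `callq *-8(%rbp)`, rather than
+; through a separate movq into a register followed by `callq *%reg`.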
+define void @foldCallOper(i32 (i32*, i32, i32**)* nocapture %p1) #0 {
+entry:
+ %0 = load i32*** @a, align 8
+ br label %for.body.i
+
+for.body.i: ; preds = %for.body.i, %entry
+ %exitcond5.i = icmp eq i32 undef, undef
+ br i1 %exitcond5.i, label %for.body3.lr.ph.i, label %for.body.i
+
+for.body3.lr.ph.i: ; preds = %for.body.i
+ %call.i = tail call i32 %p1(i32* undef, i32 0, i32** null)
+ %tobool.i = icmp eq i32 %call.i, 0
+ br label %for.body3.i
+
+for.body3.i: ; preds = %for.inc8.i, %for.body3.lr.ph.i
+ %1 = phi i32* [ undef, %for.body3.lr.ph.i ], [ %.pre.i, %for.inc8.i ]
+ %indvars.iv.i = phi i64 [ 1, %for.body3.lr.ph.i ], [ %phitmp.i, %for.inc8.i ]
+ %call5.i = tail call i32 %p1(i32* %1, i32 0, i32** %0)
+ br i1 %tobool.i, label %for.inc8.i, label %if.then.i
+
+if.then.i: ; preds = %for.body3.i
+ %2 = load i32* %1, align 4
+ store i32 %2, i32* @b, align 4
+ br label %for.inc8.i
+
+for.inc8.i: ; preds = %if.then.i, %for.body3.i
+ %lftr.wideiv.i = trunc i64 %indvars.iv.i to i32
+ %arrayidx4.phi.trans.insert.i = getelementptr inbounds [0 x i32*]* undef, i64 0, i64 %indvars.iv.i
+ %.pre.i = load i32** %arrayidx4.phi.trans.insert.i, align 8
+ %phitmp.i = add i64 %indvars.iv.i, 1
+ br label %for.body3.i
+}
+
+attributes #0 = { noreturn uwtable "no-frame-pointer-elim"="true" }
diff --git a/test/CodeGen/X86/fold-vector-sext-crash.ll b/test/CodeGen/X86/fold-vector-sext-crash.ll
new file mode 100644
index 0000000..52ea7a9
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-sext-crash.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -mcpu=core-avx-i -mtriple=i386-unknown-linux-gnu -mattr=+avx,+popcnt,+cmov
+
+; Make sure that we don't introduce illegal build_vector dag nodes
+; when trying to fold a sign_extend of a constant build_vector.
+; After r200234 the test case below was crashing the compiler with an assertion failure
+; due to an illegal build_vector of type MVT::v4i64.
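+;
+; A sketch of the mechanism (an assumption based on the comment above): the
+; constant <4 x i1> select mask gets sign-extended to the 64-bit element width
+; of the vselect, and folding that sign_extend of the constant build_vector
+; must not create a v4i64 build_vector, since its i64 scalar operands are not
+; legal on this i386 target.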
+
+define <4 x i64> @foo(<4 x i64> %A) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x i64> %A, <4 x i64><i64 undef, i64 undef, i64 0, i64 0>
+ ret <4 x i64> %1
+}
+
diff --git a/test/CodeGen/X86/fold-vector-sext-zext.ll b/test/CodeGen/X86/fold-vector-sext-zext.ll
new file mode 100644
index 0000000..aeaab44
--- /dev/null
+++ b/test/CodeGen/X86/fold-vector-sext-zext.ll
@@ -0,0 +1,291 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx -mattr=+avx | FileCheck %s
+
+; Verify that the backend correctly folds a sign/zero extend of a vector where
+; all elements are constant values or UNDEFs.
+; The backend should be able to optimize all the test functions below into
+; simple loads of the result from the constant pool, because the resulting
+; vector is known at compile time.
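+;
+; For instance (a sketch of the expected fold, not part of the checked output):
+; in test1 below the insertelements build the constant <4 x i8> <i8 0, i8 -1,
+; i8 2, i8 -3>, so its sext folds to the constant <4 x i16> <i16 0, i16 -1,
+; i16 2, i16 -3>, which is simply loaded with vmovaps.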
+
+
+define <4 x i16> @test1() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test1
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i16> @test2() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test2
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test3() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test3
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test4
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+
+define <4 x i64> @test5() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test6() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = sext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test7() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test7
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test8() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test9() {
+ %1 = insertelement <8 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 undef, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 undef, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test9
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test10() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 undef, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 undef, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 undef, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 undef, i32 7
+ %9 = sext <8 x i8> %4 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+
+define <4 x i16> @test11() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test11
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test12() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test12
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test13() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i16> @test14() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <4 x i8> %3, i8 -3, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i16>
+ ret <4 x i16> %5
+}
+; CHECK-LABEL: test14
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test15() {
+ %1 = insertelement <4 x i8> undef, i8 0, i32 0
+ %2 = insertelement <4 x i8> %1, i8 undef, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 undef, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i32>
+ ret <4 x i32> %5
+}
+; CHECK-LABEL: test15
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <4 x i64> @test16() {
+ %1 = insertelement <4 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <4 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <4 x i8> %2, i8 2, i32 2
+ %4 = insertelement <4 x i8> %3, i8 undef, i32 3
+ %5 = zext <4 x i8> %4 to <4 x i64>
+ ret <4 x i64> %5
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test17() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test17
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test18() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test18
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test19() {
+ %1 = insertelement <8 x i8> undef, i8 undef, i32 0
+ %2 = insertelement <8 x i8> %1, i8 -1, i32 1
+ %3 = insertelement <8 x i8> %2, i8 undef, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 undef, i32 4
+ %6 = insertelement <8 x i8> %5, i8 -5, i32 5
+ %7 = insertelement <8 x i8> %6, i8 undef, i32 6
+ %8 = insertelement <8 x i8> %7, i8 -7, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i16>
+ ret <8 x i16> %9
+}
+; CHECK-LABEL: test19
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
+define <8 x i32> @test20() {
+ %1 = insertelement <8 x i8> undef, i8 0, i32 0
+ %2 = insertelement <8 x i8> %1, i8 undef, i32 1
+ %3 = insertelement <8 x i8> %2, i8 2, i32 2
+ %4 = insertelement <8 x i8> %3, i8 -3, i32 3
+ %5 = insertelement <8 x i8> %4, i8 4, i32 4
+ %6 = insertelement <8 x i8> %5, i8 undef, i32 5
+ %7 = insertelement <8 x i8> %6, i8 6, i32 6
+ %8 = insertelement <8 x i8> %7, i8 undef, i32 7
+ %9 = zext <8 x i8> %8 to <8 x i32>
+ ret <8 x i32> %9
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: vinsertf128
+; CHECK: vmovaps
+; CHECK-NEXT: ret
+
diff --git a/test/CodeGen/X86/fold-xmm-zero.ll b/test/CodeGen/X86/fold-xmm-zero.ll
index b4eeb40..c92d45c 100644
--- a/test/CodeGen/X86/fold-xmm-zero.ll
+++ b/test/CodeGen/X86/fold-xmm-zero.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-macosx10.6.7 -mattr=+sse2 -no-integrated-as | FileCheck %s
; Simple test to make sure folding for special constants (like float zero)
; isn't completely broken.
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
index 07baca8..7b08ad6 100644
--- a/test/CodeGen/X86/fp-fast.ll
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 -mattr=+avx,-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7-avx -enable-unsafe-fp-math < %s | FileCheck %s
; CHECK-LABEL: test1
define float @test1(float %a) {
diff --git a/test/CodeGen/X86/gcc_except_table.ll b/test/CodeGen/X86/gcc_except_table.ll
index fcc4e9f..7a29b07 100644
--- a/test/CodeGen/X86/gcc_except_table.ll
+++ b/test/CodeGen/X86/gcc_except_table.ll
@@ -1,12 +1,35 @@
-; RUN: llc -mtriple x86_64-apple-darwin %s -o - | FileCheck %s
+; RUN: llc -mtriple x86_64-apple-darwin %s -o - | FileCheck %s --check-prefix=APPLE
+; RUN: llc -mtriple x86_64-pc-windows-gnu %s -o - | FileCheck %s --check-prefix=MINGW64
+; RUN: llc -mtriple i686-pc-windows-gnu %s -o - | FileCheck %s --check-prefix=MINGW32
@_ZTIi = external constant i8*
define i32 @main() uwtable optsize ssp {
-; CHECK: .cfi_startproc
-; CHECK: .cfi_personality 155, ___gxx_personality_v0
-; CHECK: .cfi_lsda 16, Lexception0
-; CHECK: .cfi_def_cfa_offset 16
-; CHECK: .cfi_endproc
+; APPLE: .cfi_startproc
+; APPLE: .cfi_personality 155, ___gxx_personality_v0
+; APPLE: .cfi_lsda 16, Lexception0
+; APPLE: .cfi_def_cfa_offset 16
+; APPLE: callq __Unwind_Resume
+; APPLE: .cfi_endproc
+; APPLE: GCC_except_table0:
+; APPLE: Lexception0:
+
+; MINGW64: .cfi_startproc
+; MINGW64: .cfi_personality 0, __gxx_personality_v0
+; MINGW64: .cfi_lsda 0, .Lexception0
+; MINGW64: .cfi_def_cfa_offset 16
+; MINGW64: callq _Unwind_Resume
+; MINGW64: .cfi_endproc
+; MINGW64: GCC_except_table0:
+; MINGW64: Lexception0:
+
+; MINGW32: .cfi_startproc
+; MINGW32: .cfi_personality 0, ___gxx_personality_v0
+; MINGW32: .cfi_lsda 0, Lexception0
+; MINGW32: .cfi_def_cfa_offset 8
+; MINGW32: calll __Unwind_Resume
+; MINGW32: .cfi_endproc
+; MINGW32: GCC_except_table0:
+; MINGW32: Lexception0:
entry:
invoke void @_Z1fv() optsize
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index d8743ac..5ad5047 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -1,7 +1,16 @@
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu | FileCheck %s -check-prefix=LINUX
; RUN: llc < %s -mtriple=i386-apple-darwin9.7 | FileCheck %s -check-prefix=DARWIN
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -relocation-model=static | FileCheck %s -check-prefix=DARWIN-STATIC
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=DARWIN64
; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -fdata-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i686-pc-win32 -fdata-sections -ffunction-sections | FileCheck %s -check-prefix=WIN32-SECTIONS
+define void @F1() {
+ ret void
+}
+
+; WIN32-SECTIONS: .section .text,"xr",one_only,_F1
+; WIN32-SECTIONS: .globl _F1
; int G1;
@G1 = common global i32 0
@@ -39,6 +48,9 @@
; LINUX-SECTIONS: .section .rodata.G3,"a",@progbits
; LINUX-SECTIONS: .globl G3
+; WIN32-SECTIONS: .section .rdata,"r",one_only,_G3
+; WIN32-SECTIONS: .globl _G3
+
; _Complex long long const G4 = 34;
@G4 = unnamed_addr constant {i64,i64} { i64 34, i64 0 }
@@ -47,6 +59,14 @@
; DARWIN: _G4:
; DARWIN: .long 34
+; DARWIN-STATIC: .section __TEXT,__literal16,16byte_literals
+; DARWIN-STATIC: _G4:
+; DARWIN-STATIC: .long 34
+
+; DARWIN64: .section __TEXT,__literal16,16byte_literals
+; DARWIN64: _G4:
+; DARWIN64: .quad 34
+
; int G5 = 47;
@G5 = global i32 47
@@ -107,6 +127,9 @@
; LINUX-SECTIONS: .section .rodata.G7,"aMS",@progbits,1
; LINUX-SECTIONS: .globl G7
+; WIN32-SECTIONS: .section .rdata,"r",one_only,_G7
+; WIN32-SECTIONS: .globl _G7
+
@G8 = unnamed_addr constant [4 x i16] [ i16 1, i16 2, i16 3, i16 0 ]
@@ -158,3 +181,16 @@
; DARWIN: .zerofill __DATA,__common,_G12,1,3
; DARWIN: .globl _G13
; DARWIN: .zerofill __DATA,__common,_G13,1,3
+
+@G14 = private unnamed_addr constant [4 x i8] c"foo\00", align 1
+
+; LINUX-SECTIONS: .type .LG14,@object # @G14
+; LINUX-SECTIONS: .section .rodata..LG14,"aMS",@progbits,1
+; LINUX-SECTIONS: .LG14:
+; LINUX-SECTIONS: .asciz "foo"
+; LINUX-SECTIONS: .size .LG14, 4
+
+; WIN32-SECTIONS: .section .rdata,"r"
+; WIN32-SECTIONS: L_G14:
+; WIN32-SECTIONS: .asciz "foo"
+
diff --git a/test/CodeGen/X86/hidden-vis-pic.ll b/test/CodeGen/X86/hidden-vis-pic.ll
index 67be3d0..1caab7a 100644
--- a/test/CodeGen/X86/hidden-vis-pic.ll
+++ b/test/CodeGen/X86/hidden-vis-pic.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin9 -relocation-model=pic -disable-fp-elim | FileCheck %s
@@ -48,8 +48,3 @@ return: ; preds = %entry
%retval1 = load i32* %retval ; <i32> [#uses=1]
ret i32 %retval1
}
-
-; CHECK: .private_extern _func.eh
-; CHECK: .private_extern _main.eh
-
-
diff --git a/test/CodeGen/X86/i64-mem-copy.ll b/test/CodeGen/X86/i64-mem-copy.ll
index dce12ae..bf77896 100644
--- a/test/CodeGen/X86/i64-mem-copy.ll
+++ b/test/CodeGen/X86/i64-mem-copy.ll
@@ -3,7 +3,7 @@
; X64: movq ({{%rsi|%rdx}}), %r
; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s -check-prefix=X32
-; X32: movsd (%eax), %xmm
+; X32: movsd ({{%ecx|%eax}}), %xmm
; Uses movsd to load/store i64 values when SSE2 is available.
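
The hunk above shows only the CHECK lines; a minimal sketch of the kind of IR
this test compiles (hypothetical function name, typed-pointer syntax as used
throughout this patch):

define void @copy_i64(i64* %dst, i64* %src) nounwind {
  %v = load i64* %src       ; with SSE2 on x86-32 this lowers to movsd (mem), %xmm
  store i64 %v, i64* %dst   ; ... and movsd %xmm, (mem), avoiding a GPR pair
  ret void
}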
diff --git a/test/CodeGen/X86/inalloca-ctor.ll b/test/CodeGen/X86/inalloca-ctor.ll
new file mode 100644
index 0000000..7cfa929
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-ctor.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Foo = type { i32, i32 }
+
+%frame = type { %Foo, i32, %Foo }
+
+declare void @f(%frame* inalloca %a)
+
+declare void @Foo_ctor(%Foo* %this)
+
+define void @g() {
+entry:
+ %args = alloca inalloca %frame
+ %c = getelementptr %frame* %args, i32 0, i32 2
+; CHECK: movl $20, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %esp,
+ call void @Foo_ctor(%Foo* %c)
+; CHECK: leal 12(%{{.*}}),
+; CHECK: subl $4, %esp
+; CHECK: calll _Foo_ctor
+; CHECK: addl $4, %esp
+ %b = getelementptr %frame* %args, i32 0, i32 1
+ store i32 42, i32* %b
+; CHECK: movl $42,
+ %a = getelementptr %frame* %args, i32 0, i32 0
+ call void @Foo_ctor(%Foo* %a)
+; CHECK: subl $4, %esp
+; CHECK: calll _Foo_ctor
+; CHECK: addl $4, %esp
+ call void @f(%frame* inalloca %args)
+; CHECK: calll _f
+ ret void
+}
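
For reference, the inalloca convention these new tests exercise, as a minimal
standalone sketch with hypothetical names: the caller itself allocates the
whole outgoing argument block, initializes it in place, and the call then
consumes that allocation as the callee's argument area.

%Args = type { i32 }

declare void @take(%Args* inalloca %a)

define void @caller() {
  %m = alloca inalloca %Args            ; reserves the outgoing argument block
  %p = getelementptr %Args* %m, i32 0, i32 0
  store i32 7, i32* %p                  ; the argument is initialized in place
  call void @take(%Args* inalloca %m)   ; the call consumes the allocation
  ret void
}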
diff --git a/test/CodeGen/X86/inalloca-invoke.ll b/test/CodeGen/X86/inalloca-invoke.ll
new file mode 100644
index 0000000..6cff9ac
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-invoke.ll
@@ -0,0 +1,54 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Iter = type { i32, i32, i32 }
+
+%frame.reverse = type { %Iter, %Iter }
+
+declare void @llvm.stackrestore(i8*)
+declare i8* @llvm.stacksave()
+declare void @begin(%Iter* sret)
+declare void @plus(%Iter* sret, %Iter*, i32)
+declare void @reverse(%frame.reverse* inalloca align 4)
+
+define i32 @main() {
+ %temp.lvalue = alloca %Iter
+ br label %blah
+
+blah:
+ %inalloca.save = call i8* @llvm.stacksave()
+ %rev_args = alloca inalloca %frame.reverse, align 4
+ %beg = getelementptr %frame.reverse* %rev_args, i32 0, i32 0
+ %end = getelementptr %frame.reverse* %rev_args, i32 0, i32 1
+
+; CHECK: calll __chkstk
+; CHECK: movl %[[beg:[^,]*]], %esp
+; CHECK: leal 12(%[[beg]]), %[[end:[^ ]*]]
+
+ call void @begin(%Iter* sret %temp.lvalue)
+; CHECK: calll _begin
+
+ invoke void @plus(%Iter* sret %end, %Iter* %temp.lvalue, i32 4)
+ to label %invoke.cont unwind label %lpad
+
+; Uses %end as the sret parameter.
+; CHECK: movl %[[end]], (%esp)
+; CHECK: calll _plus
+
+invoke.cont:
+ call void @begin(%Iter* sret %beg)
+
+; CHECK: movl %[[beg]],
+; CHECK: calll _begin
+
+ invoke void @reverse(%frame.reverse* inalloca align 4 %rev_args)
+ to label %invoke.cont5 unwind label %lpad
+
+invoke.cont5: ; preds = %invoke.cont
+ call void @llvm.stackrestore(i8* %inalloca.save)
+ ret i32 0
+
+lpad: ; preds = %invoke.cont, %entry
+ %lp = landingpad { i8*, i32 } personality i8* null
+ cleanup
+ unreachable
+}
diff --git a/test/CodeGen/X86/inalloca-stdcall.ll b/test/CodeGen/X86/inalloca-stdcall.ll
new file mode 100644
index 0000000..54f97d9
--- /dev/null
+++ b/test/CodeGen/X86/inalloca-stdcall.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Foo = type { i32, i32 }
+
+declare x86_stdcallcc void @f(%Foo* inalloca %a)
+declare x86_stdcallcc void @i(i32 %a)
+
+define void @g() {
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK: movl $13, (%[[REG]])
+; CHECK: movl $42, 4(%[[REG]])
+ call x86_stdcallcc void @f(%Foo* inalloca %b)
+; CHECK: calll _f@8
+; CHECK-NOT: %esp
+; CHECK: subl $4, %esp
+; CHECK: calll _i@4
+ call x86_stdcallcc void @i(i32 0)
+ ret void
+}
diff --git a/test/CodeGen/X86/inalloca.ll b/test/CodeGen/X86/inalloca.ll
new file mode 100644
index 0000000..12643f9
--- /dev/null
+++ b/test/CodeGen/X86/inalloca.ll
@@ -0,0 +1,65 @@
+; RUN: llc < %s -mtriple=i686-pc-win32 | FileCheck %s
+
+%Foo = type { i32, i32 }
+
+declare void @f(%Foo* inalloca %b)
+
+define void @a() {
+; CHECK-LABEL: _a:
+entry:
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK: movl $13, (%[[REG]])
+; CHECK: movl $42, 4(%[[REG]])
+ call void @f(%Foo* inalloca %b)
+; CHECK: calll _f
+ ret void
+}
+
+declare void @inreg_with_inalloca(i32 inreg %a, %Foo* inalloca %b)
+
+define void @b() {
+; CHECK-LABEL: _b:
+entry:
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK: movl $13, (%[[REG]])
+; CHECK: movl $42, 4(%[[REG]])
+ call void @inreg_with_inalloca(i32 inreg 1, %Foo* inalloca %b)
+; CHECK: movl $1, %eax
+; CHECK: calll _inreg_with_inalloca
+ ret void
+}
+
+declare x86_thiscallcc void @thiscall_with_inalloca(i8* %a, %Foo* inalloca %b)
+
+define void @c() {
+; CHECK-LABEL: _c:
+entry:
+ %b = alloca inalloca %Foo
+; CHECK: movl $8, %eax
+; CHECK: calll __chkstk
+; CHECK: movl %[[REG:[^,]*]], %esp
+ %f1 = getelementptr %Foo* %b, i32 0, i32 0
+ %f2 = getelementptr %Foo* %b, i32 0, i32 1
+ store i32 13, i32* %f1
+ store i32 42, i32* %f2
+; CHECK-DAG: movl $13, (%[[REG]])
+; CHECK-DAG: movl $42, 4(%[[REG]])
+ call x86_thiscallcc void @thiscall_with_inalloca(i8* null, %Foo* inalloca %b)
+; CHECK-DAG: xorl %ecx, %ecx
+; CHECK: calll _thiscall_with_inalloca
+ ret void
+}
diff --git a/test/CodeGen/X86/inline-asm-flag-clobber.ll b/test/CodeGen/X86/inline-asm-flag-clobber.ll
index 45f4d2f..bb7c33e 100644
--- a/test/CodeGen/X86/inline-asm-flag-clobber.ll
+++ b/test/CodeGen/X86/inline-asm-flag-clobber.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -march=x86-64 -no-integrated-as < %s | FileCheck %s
; PR3701
define i64 @t(i64* %arg) nounwind {
diff --git a/test/CodeGen/X86/inline-asm-fpstack.ll b/test/CodeGen/X86/inline-asm-fpstack.ll
index e83c065..91c477b 100644
--- a/test/CodeGen/X86/inline-asm-fpstack.ll
+++ b/test/CodeGen/X86/inline-asm-fpstack.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -mcpu=generic -mtriple=i386-apple-darwin -no-integrated-as | FileCheck %s
; There should be no stack manipulations between the inline asm and ret.
; CHECK: test1
diff --git a/test/CodeGen/X86/inline-asm-h.ll b/test/CodeGen/X86/inline-asm-h.ll
index 53cf419..8c3e45a 100644
--- a/test/CodeGen/X86/inline-asm-h.ll
+++ b/test/CodeGen/X86/inline-asm-h.ll
@@ -9,4 +9,4 @@ entry:
}
; CHECK: zed
-; CHECK: movq %mm2,foobar+8(%rip)
+; CHECK: movq %mm2, foobar+8(%rip)
diff --git a/test/CodeGen/X86/inline-asm-modifier-n.ll b/test/CodeGen/X86/inline-asm-modifier-n.ll
index b069c46..072c7c4 100644
--- a/test/CodeGen/X86/inline-asm-modifier-n.ll
+++ b/test/CodeGen/X86/inline-asm-modifier-n.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | grep " 37"
+; RUN: llc < %s -march=x86 -no-integrated-as | grep " 37"
; rdar://7008959
define void @bork() nounwind {
diff --git a/test/CodeGen/X86/inline-asm-modifier-q.ll b/test/CodeGen/X86/inline-asm-modifier-q.ll
new file mode 100644
index 0000000..8063d48
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-modifier-q.ll
@@ -0,0 +1,12 @@
+; RUN: llc < %s -march=x86 -no-integrated-as | FileCheck %s
+
+; If the target does not have 64-bit integer registers, emit 32-bit register
+; names.
+
+; CHECK: movq (%e{{[abcd]}}x, %ebx, 4)
+
+define void @q_modifier(i32* %p) {
+entry:
+ tail call void asm sideeffect "movq (${0:q}, %ebx, 4), %mm0", "r,~{dirflag},~{fpsr},~{flags}"(i32* %p)
+ ret void
+}
diff --git a/test/CodeGen/X86/inline-asm-mrv.ll b/test/CodeGen/X86/inline-asm-mrv.ll
index 733205d..a96e7b8 100644
--- a/test/CodeGen/X86/inline-asm-mrv.ll
+++ b/test/CodeGen/X86/inline-asm-mrv.ll
@@ -1,8 +1,8 @@
; PR2094
-; RUN: llc < %s -march=x86-64 | grep movslq
-; RUN: llc < %s -march=x86-64 | grep addps
-; RUN: llc < %s -march=x86-64 | grep paddd
-; RUN: llc < %s -march=x86-64 | not grep movq
+; RUN: llc < %s -march=x86-64 -no-integrated-as | grep movslq
+; RUN: llc < %s -march=x86-64 -no-integrated-as | grep addps
+; RUN: llc < %s -march=x86-64 -no-integrated-as | grep paddd
+; RUN: llc < %s -march=x86-64 -no-integrated-as | not grep movq
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128"
target triple = "x86_64-apple-darwin8"
diff --git a/test/CodeGen/X86/inline-asm-q-regs.ll b/test/CodeGen/X86/inline-asm-q-regs.ll
index fca68ba..53a56ae 100644
--- a/test/CodeGen/X86/inline-asm-q-regs.ll
+++ b/test/CodeGen/X86/inline-asm-q-regs.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+avx
+; RUN: llc < %s -march=x86-64 -mattr=+avx -no-integrated-as
; rdar://7066579
%0 = type { i64, i64, i64, i64, i64 } ; type %0
diff --git a/test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll b/test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll
new file mode 100644
index 0000000..b55571b
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-sp-clobber-memcpy.ll
@@ -0,0 +1,17 @@
+; RUN: llc < %s -force-align-stack -mtriple i386-apple-darwin -mcpu=i486 | FileCheck %s
+
+%struct.foo = type { [88 x i8] }
+
+declare void @bar(i8* nocapture, %struct.foo* align 4 byval) nounwind
+
+; PR19012
+; Don't clobber %esi if we have inline asm that clobbers %esp.
+define void @test1(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+ call void @bar(i8* %z, %struct.foo* align 4 byval %x)
+ call void asm sideeffect inteldialect "xor esp, esp", "=*m,~{flags},~{esp},~{esp},~{dirflag},~{fpsr},~{flags}"(i8* %z)
+ ret void
+
+; CHECK-LABEL: test1:
+; CHECK: movl %esp, %esi
+; CHECK-NOT: rep;movsl
+}
diff --git a/test/CodeGen/X86/inline-asm-stack-realign.ll b/test/CodeGen/X86/inline-asm-stack-realign.ll
new file mode 100644
index 0000000..f2ac0f4
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-stack-realign.ll
@@ -0,0 +1,16 @@
+; RUN: not llc -mtriple=i686-pc-win32 < %s 2>&1 | FileCheck %s
+
+; FIXME: This is miscompiled due to our unconditional use of ESI as the base
+; pointer.
+; XFAIL: *
+
+; CHECK: Stack realignment in presence of dynamic stack adjustments is not supported with inline assembly
+
+define i32 @foo() {
+entry:
+ %r = alloca i32, align 16
+ store i32 -1, i32* %r, align 16
+ call void asm sideeffect inteldialect "push esi\0A\09xor esi, esi\0A\09mov dword ptr $0, esi\0A\09pop esi", "=*m,~{flags},~{esi},~{esp},~{dirflag},~{fpsr},~{flags}"(i32* %r)
+ %0 = load i32* %r, align 16
+ ret i32 %0
+}
diff --git a/test/CodeGen/X86/inline-asm-stack-realign2.ll b/test/CodeGen/X86/inline-asm-stack-realign2.ll
new file mode 100644
index 0000000..0e4e7e1
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-stack-realign2.ll
@@ -0,0 +1,16 @@
+; RUN: not llc -mtriple=i686-pc-win32 < %s 2>&1 | FileCheck %s
+
+; FIXME: This is miscompiled due to our unconditional use of ESI as the base
+; pointer.
+; XFAIL: *
+
+; CHECK: Stack realignment in presence of dynamic stack adjustments is not supported with inline assembly
+
+define i32 @foo() {
+entry:
+ %r = alloca i32, align 16
+ store i32 -1, i32* %r, align 16
+ call void asm sideeffect "push %esi\0A\09xor %esi, %esi\0A\09mov %esi, $0\0A\09pop %esi", "=*m,~{flags},~{esi},~{esp},~{dirflag},~{fpsr},~{flags}"(i32* %r)
+ %0 = load i32* %r, align 16
+ ret i32 %0
+}
diff --git a/test/CodeGen/X86/inline-asm-stack-realign3.ll b/test/CodeGen/X86/inline-asm-stack-realign3.ll
new file mode 100644
index 0000000..3baaaaa
--- /dev/null
+++ b/test/CodeGen/X86/inline-asm-stack-realign3.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=x86 -no-integrated-as < %s | FileCheck %s
+
+declare void @bar(i32* %junk)
+
+define i32 @foo(i1 %cond) {
+entry:
+ %r = alloca i32, align 128
+ store i32 -1, i32* %r, align 128
+ br i1 %cond, label %doit, label %skip
+
+doit:
+ call void asm sideeffect "xor %ecx, %ecx\0A\09mov %ecx, $0", "=*m,~{ecx},~{flags}"(i32* %r)
+ %junk = alloca i32
+ call void @bar(i32* %junk)
+ br label %skip
+
+skip:
+ %0 = load i32* %r, align 128
+ ret i32 %0
+}
+
+; CHECK-LABEL: foo:
+; CHECK: pushl %ebp
+; CHECK: andl $-128, %esp
+; CHECK: xor %ecx, %ecx
+; CHECK-NEXT: mov %ecx, (%esi)
+; CHECK: movl (%esi), %eax
+; CHECK: popl %ebp
+; CHECK: ret
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 597236e..fb5896b 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic | FileCheck %s
+; RUN: llc < %s -mtriple=i386-apple-darwin9 -O0 -optimize-regalloc -regalloc=basic -no-integrated-as | FileCheck %s
; rdar://6992609
; CHECK: movl [[EDX:%e..]], 4(%esp)
diff --git a/test/CodeGen/X86/inline-asm-x-scalar.ll b/test/CodeGen/X86/inline-asm-x-scalar.ll
index 5a9628b..64a7fe8 100644
--- a/test/CodeGen/X86/inline-asm-x-scalar.ll
+++ b/test/CodeGen/X86/inline-asm-x-scalar.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mcpu=yonah
+; RUN: llc < %s -march=x86 -mcpu=yonah -no-integrated-as
define void @test1() {
tail call void asm sideeffect "ucomiss $0", "x"( float 0x41E0000000000000)
diff --git a/test/CodeGen/X86/inline-asm.ll b/test/CodeGen/X86/inline-asm.ll
index d201ebd..5ec4f46 100644
--- a/test/CodeGen/X86/inline-asm.ll
+++ b/test/CodeGen/X86/inline-asm.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
define i32 @test1() nounwind {
; Dest is AX, dest type = i32.
@@ -59,3 +59,18 @@ entry:
%asm = tail call i32 asm sideeffect "", "={ax},i,~{eax},~{flags},~{rax}"(i64 61) nounwind
ret i32 %asm
}
+
+@test8_v = global i32 42
+
+define void @test8() {
+ call void asm sideeffect "${0:P}", "i"( i32* @test8_v )
+ ret void
+}
+
+define void @test9() {
+ call void asm sideeffect "${0:P}", "X"( i8* blockaddress(@test9, %bb) )
+ br label %bb
+
+bb:
+ ret void
+}
diff --git a/test/CodeGen/X86/ins_split_regalloc.ll b/test/CodeGen/X86/ins_split_regalloc.ll
new file mode 100644
index 0000000..f5c5254
--- /dev/null
+++ b/test/CodeGen/X86/ins_split_regalloc.ll
@@ -0,0 +1,33 @@
+; RUN: llc -O1 -regalloc=greedy -mtriple=x86_64-apple-macosx -march x86-64 < %s -o - | FileCheck %s
+; Check that last-chance split (RAGreedy::tryInstructionSplit) only splits
+; when this is beneficial; otherwise we end up with uncoalesced copies.
+; <rdar://problem/15570057>
+
+target datalayout = "e-i64:64-f80:128-s:64-n8:16:32:64-S128"
+
+@f = external constant void (i32)*
+
+; CHECK-LABEL: test:
+; Get the address of f in the GOT.
+; CHECK: movq _f@{{[^,]+}}, [[F_ENTRY_ADDR:%[a-z0-9]+]]
+; Read the actual address of f.
+; CHECK: movq ([[F_ENTRY_ADDR]]), [[F_ADDR:%[a-z0-9]+]]
+; Check that we do not have useless split points before each call.
+; CHECK-NOT: movq
+; CHECK: callq *[[F_ADDR]]
+; Check that we do not have useless split points before each call.
+; CHECK-NOT: movq
+; CHECK: callq *[[F_ADDR]]
+; The last call is a tail call, so the address of the function cannot live in
+; a callee-saved register.
+; CHECK: movq [[F_ADDR]], [[F_ADDR_TC:%[a-z0-9]+]]
+; CHECK: popq [[F_ADDR]]
+; CHECK: jmpq *[[F_ADDR_TC]]
+define void @test(i32 %a, i32 %b, i32 %c) {
+entry:
+ %fct_f = load void (i32)** @f, align 8
+ tail call void %fct_f(i32 %a)
+ tail call void %fct_f(i32 %b)
+ tail call void %fct_f(i32 %c)
+ ret void
+}
diff --git a/test/CodeGen/X86/isint.ll b/test/CodeGen/X86/isint.ll
index 4a98e63..38d05c6 100644
--- a/test/CodeGen/X86/isint.ll
+++ b/test/CodeGen/X86/isint.ll
@@ -1,6 +1,11 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-unknown -mattr=+sse2 -mcpu=penryn | FileCheck %s
+; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 -mcpu=penryn | FileCheck %s
+
+; PR19059
+; RUN: llc < %s -mtriple=i686-pc-unknown -mattr=+sse2 -mcpu=penryn | FileCheck -check-prefix=CHECK32 %s
define i32 @isint_return(double %d) nounwind {
+; CHECK-LABEL: isint_return:
; CHECK-NOT: xor
; CHECK: cvt
%i = fptosi double %d to i32
@@ -8,6 +13,24 @@ define i32 @isint_return(double %d) nounwind {
%e = sitofp i32 %i to double
; CHECK: cmpeqsd
%c = fcmp oeq double %d, %e
+; CHECK32-NOT: movd {{.*}}, %r{{.*}}
+; CHECK32-NOT: andq
+; CHECK-NEXT: movd
+; CHECK-NEXT: andl
+ %z = zext i1 %c to i32
+ ret i32 %z
+}
+
+define i32 @isint_float_return(float %f) nounwind {
+; CHECK-LABEL: isint_float_return:
+; CHECK-NOT: xor
+; CHECK: cvt
+ %i = fptosi float %f to i32
+; CHECK-NEXT: cvt
+ %g = sitofp i32 %i to float
+; CHECK: cmpeqss
+ %c = fcmp oeq float %f, %g
+; CHECK-NOT: movd {{.*}}, %r{{.*}}
; CHECK-NEXT: movd
; CHECK-NEXT: andl
%z = zext i1 %c to i32
@@ -17,6 +40,7 @@ define i32 @isint_return(double %d) nounwind {
declare void @foo()
define void @isint_branch(double %d) nounwind {
+; CHECK-LABEL: isint_branch:
; CHECK: cvt
%i = fptosi double %d to i32
; CHECK-NEXT: cvt
diff --git a/test/CodeGen/X86/large-constants.ll b/test/CodeGen/X86/large-constants.ll
new file mode 100644
index 0000000..157ecc4
--- /dev/null
+++ b/test/CodeGen/X86/large-constants.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -mtriple=x86_64-darwin -mcpu=corei7 | grep movabsq | count 3
+
+define i64 @constant_hoisting(i64 %o0, i64 %o1, i64 %o2, i64 %o3, i64 %o4, i64 %o5) {
+entry:
+ %l0 = and i64 %o0, -281474976710654
+ %c0 = icmp ne i64 %l0, 0
+ br i1 %c0, label %fail, label %bb1
+
+bb1:
+ %l1 = and i64 %o1, -281474976710654
+ %c1 = icmp ne i64 %l1, 0
+ br i1 %c1, label %fail, label %bb2
+
+bb2:
+ %l2 = and i64 %o2, -281474976710654
+ %c2 = icmp ne i64 %l2, 0
+ br i1 %c2, label %fail, label %bb3
+
+bb3:
+ %l3 = and i64 %o3, -281474976710654
+ %c3 = icmp ne i64 %l3, 0
+ br i1 %c3, label %fail, label %bb4
+
+bb4:
+ %l4 = and i64 %o4, -281474976710653
+ %c4 = icmp ne i64 %l4, 0
+ br i1 %c4, label %fail, label %bb5
+
+bb5:
+ %l5 = and i64 %o5, -281474976710652
+ %c5 = icmp ne i64 %l5, 0
+ br i1 %c5, label %fail, label %bb6
+
+bb6:
+ ret i64 %l5
+
+fail:
+ ret i64 -1
+}
+
+define void @constant_expressions() {
+entry:
+ %0 = load i64* inttoptr (i64 add (i64 51250129900, i64 0) to i64*)
+ %1 = load i64* inttoptr (i64 add (i64 51250129900, i64 8) to i64*)
+ %2 = load i64* inttoptr (i64 add (i64 51250129900, i64 16) to i64*)
+ %3 = load i64* inttoptr (i64 add (i64 51250129900, i64 24) to i64*)
+ %4 = add i64 %0, %1
+ %5 = add i64 %2, %3
+ %6 = add i64 %4, %5
+ store i64 %6, i64* inttoptr (i64 add (i64 51250129900, i64 0) to i64*)
+ ret void
+}
+
+
+define void @constant_expressions2() {
+entry:
+ %0 = load i64* inttoptr (i64 51250129900 to i64*)
+ %1 = load i64* inttoptr (i64 51250129908 to i64*)
+ %2 = load i64* inttoptr (i64 51250129916 to i64*)
+ %3 = load i64* inttoptr (i64 51250129924 to i64*)
+ %4 = add i64 %0, %1
+ %5 = add i64 %2, %3
+ %6 = add i64 %4, %5
+ store i64 %6, i64* inttoptr (i64 51250129900 to i64*)
+ ret void
+}
+
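
The "count 3" in the RUN line is the point of this test: after constant
hoisting, each of the three functions should materialize its large immediate
with a single movabsq instead of one per use. A minimal sketch of that
property (hypothetical function, not part of the test):

define i64 @two_uses(i64 %a, i64 %b) {
  %x = and i64 %a, -281474976710654   ; same wide immediate ...
  %y = and i64 %b, -281474976710654   ; ... should share one movabsq
  %r = add i64 %x, %y
  ret i64 %r
}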
diff --git a/test/CodeGen/X86/load-slice.ll b/test/CodeGen/X86/load-slice.ll
index 85fd7f0..49eb131 100644
--- a/test/CodeGen/X86/load-slice.ll
+++ b/test/CodeGen/X86/load-slice.ll
@@ -6,7 +6,7 @@
%class.Complex = type { float, float }
-; Check that independant slices leads to independant loads then the slices leads to
+; Check that independent slices lead to independent loads, and then that the slices map to a
; different register file.
;
; The layout is:
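
Only the comment is visible in this hunk. As a rough sketch of the slicing it
describes (hypothetical names, not the test's IR): one wide load whose halves
feed different register files can be split into two independent narrow loads.

define void @slices(i64* %p, i32* %int.out, float* %fp.out) nounwind {
  %v = load i64* %p                 ; one wide load ...
  %lo = trunc i64 %v to i32         ; ... low slice used as an integer (GPR)
  %hi64 = lshr i64 %v, 32
  %hi = trunc i64 %hi64 to i32
  %hif = bitcast i32 %hi to float   ; ... high slice used as a float (XMM)
  store i32 %lo, i32* %int.out
  store float %hif, float* %fp.out
  ret void
}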
diff --git a/test/CodeGen/X86/lsr-interesting-step.ll b/test/CodeGen/X86/lsr-interesting-step.ll
index d4a7ac7..8ea3c53 100644
--- a/test/CodeGen/X86/lsr-interesting-step.ll
+++ b/test/CodeGen/X86/lsr-interesting-step.ll
@@ -3,26 +3,24 @@
; The inner loop should require only one add (and no leas either).
; rdar://8100380
-; CHECK: BB0_3:
-; CHECK-NEXT: movb $0, flags(%rdx)
-; CHECK-NEXT: addq %rax, %rdx
-; CHECK-NEXT: cmpq $8192, %rdx
+; CHECK: BB0_2:
+; CHECK-NEXT: movb $0, flags(%rcx)
+; CHECK-NEXT: addq %rax, %rcx
+; CHECK-NEXT: cmpq $8192, %rcx
; CHECK-NEXT: jl
@flags = external global [8192 x i8], align 16 ; <[8192 x i8]*> [#uses=1]
define void @foo() nounwind {
entry:
- %tmp = icmp slt i64 2, 8192 ; <i1> [#uses=1]
- br i1 %tmp, label %bb, label %bb21
+ br label %bb
bb: ; preds = %entry
br label %bb7
bb7: ; preds = %bb, %bb17
%tmp8 = phi i64 [ %tmp18, %bb17 ], [ 2, %bb ] ; <i64> [#uses=2]
- %tmp9 = icmp slt i64 2, 8192 ; <i1> [#uses=1]
- br i1 %tmp9, label %bb10, label %bb17
+ br label %bb10
bb10: ; preds = %bb7
br label %bb11
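
The "only one add" expectation above comes from LSR strength-reducing the
address computation: rather than recomputing an index-times-step product each
iteration, the loop keeps a running offset and adds the step once. A minimal
sketch of the reduced form (hypothetical global and function, not the test's
IR):

@flags2 = external global [8192 x i8], align 16

define void @mark(i64 %step) nounwind {
entry:
  br label %loop
loop:
  %off = phi i64 [ 2, %entry ], [ %off.next, %loop ]
  %addr = getelementptr [8192 x i8]* @flags2, i64 0, i64 %off
  store i8 0, i8* %addr
  %off.next = add i64 %off, %step          ; the single add the test expects
  %done = icmp slt i64 %off.next, 8192
  br i1 %done, label %loop, label %exit
exit:
  ret void
}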
diff --git a/test/CodeGen/X86/lsr-normalization.ll b/test/CodeGen/X86/lsr-normalization.ll
index bbf8f01..2775558 100644
--- a/test/CodeGen/X86/lsr-normalization.ll
+++ b/test/CodeGen/X86/lsr-normalization.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -march=x86-64 | FileCheck %s
+; REQUIRES: asserts
+; RUN: llc < %s -march=x86-64 | FileCheck %s --check-prefix=ASM
+; RUN: llc -debug -o /dev/null < %s -march=x86-64 2>&1 | FileCheck %s --check-prefix=DBG
; rdar://8168938
; This testcase involves SCEV normalization with the exit value from
@@ -6,8 +8,9 @@
; loop. The expression should be properly normalized and simplified,
; and require only a single division.
-; CHECK: div
-; CHECK-NOT: div
+; DBG-NOT: DISCARDING (NORMALIZATION ISN'T INVERTIBLE)
+; ASM: div
+; ASM-NOT: div
%0 = type { %0*, %0* }
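
Only the RUN/CHECK scaffolding is visible here; per the comment, the shape
under test is a loop whose exit value feeds a division, and rewriting the
exit value after SCEV normalization must not duplicate that division. A
minimal standalone sketch of that shape (hypothetical, not the test's IR;
assumes %n >= 1):

define i64 @count_then_div(i64 %n, i64 %d) nounwind {
entry:
  br label %loop
loop:
  %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
  %i.next = add i64 %i, 1
  %done = icmp eq i64 %i.next, %n
  br i1 %done, label %exit, label %loop
exit:
  %q = udiv i64 %i.next, %d   ; exit-value rewriting should keep this the only div
  ret i64 %q
}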
diff --git a/test/CodeGen/X86/machine-cp.ll b/test/CodeGen/X86/machine-cp.ll
index f04e111..0006b6e 100644
--- a/test/CodeGen/X86/machine-cp.ll
+++ b/test/CodeGen/X86/machine-cp.ll
@@ -1,4 +1,4 @@
-; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-macosx -mcpu=nocona -verify-machineinstrs < %s | FileCheck %s
; After tail duplication, two copies in an early exit BB can be cancelled out.
; rdar://10640363
@@ -34,3 +34,27 @@ entry:
%tmp8 = shufflevector <8 x i16> %T0, <8 x i16> %T1, <8 x i32> < i32 undef, i32 undef, i32 7, i32 2, i32 8, i32 undef, i32 undef , i32 undef >
ret <8 x i16> %tmp8
}
+
+define i32 @t3(i64 %a, i64 %b) nounwind {
+entry:
+; CHECK-LABEL: t3:
+; CHECK: je [[LABEL:.*BB.*]]
+ %cmp1 = icmp eq i64 %b, 0
+ br i1 %cmp1, label %while.end, label %while.body
+
+; CHECK: [[LABEL]]:
+; CHECK-NOT: mov
+; CHECK: ret
+
+while.body: ; preds = %entry, %while.body
+ %a.addr.03 = phi i64 [ %b.addr.02, %while.body ], [ %a, %entry ]
+ %b.addr.02 = phi i64 [ %rem, %while.body ], [ %b, %entry ]
+ %rem = srem i64 %a.addr.03, %b.addr.02
+ %cmp = icmp eq i64 %rem, 0
+ br i1 %cmp, label %while.end, label %while.body
+
+while.end: ; preds = %while.body, %entry
+ %a.addr.0.lcssa = phi i64 [ %a, %entry ], [ %b.addr.02, %while.body ]
+ %t = trunc i64 %a.addr.0.lcssa to i32
+ ret i32 %t
+}
diff --git a/test/CodeGen/X86/mature-mc-support.ll b/test/CodeGen/X86/mature-mc-support.ll
new file mode 100644
index 0000000..9d956f4
--- /dev/null
+++ b/test/CodeGen/X86/mature-mc-support.ll
@@ -0,0 +1,18 @@
+; Test that inline assembly is parsed by the MC layer when MC support is mature
+; (even when the output is assembly).
+
+; RUN: not llc -march=x86 < %s > /dev/null 2> %t1
+; RUN: FileCheck %s < %t1
+
+; RUN: not llc -march=x86 -filetype=obj < %s > /dev/null 2> %t2
+; RUN: FileCheck %s < %t2
+
+; RUN: not llc -march=x86-64 < %s > /dev/null 2> %t3
+; RUN: FileCheck %s < %t3
+
+; RUN: not llc -march=x86-64 -filetype=obj < %s > /dev/null 2> %t4
+; RUN: FileCheck %s < %t4
+
+module asm " .this_directive_is_very_unlikely_to_exist"
+
+; CHECK: LLVM ERROR: Error parsing inline asm
diff --git a/test/CodeGen/X86/memcmp.ll b/test/CodeGen/X86/memcmp.ll
index cb0797d..0a53492 100644
--- a/test/CodeGen/X86/memcmp.ll
+++ b/test/CodeGen/X86/memcmp.ll
@@ -22,8 +22,9 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2:
-; CHECK: movw ([[A0:%rdi|%rcx]]), %ax
-; CHECK: cmpw ([[A1:%rsi|%rdx]]), %ax
+; CHECK: movzwl
+; CHECK-NEXT: movzwl
+; CHECK-NEXT: cmpl
; NOBUILTIN-LABEL: memcmp2:
; NOBUILTIN: callq
}
@@ -41,7 +42,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp2a:
-; CHECK: cmpw $28527, ([[A0]])
+; CHECK: movzwl
+; CHECK-NEXT: cmpl $28527,
}
@@ -58,8 +60,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp4:
-; CHECK: movl ([[A0]]), %eax
-; CHECK: cmpl ([[A1]]), %eax
+; CHECK: movl
+; CHECK-NEXT: cmpl
}
define void @memcmp4a(i8* %X, i32* nocapture %P) nounwind {
@@ -75,7 +77,7 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp4a:
-; CHECK: cmpl $1869573999, ([[A0]])
+; CHECK: cmpl $1869573999,
}
define void @memcmp8(i8* %X, i8* %Y, i32* nocapture %P) nounwind {
@@ -91,8 +93,8 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp8:
-; CHECK: movq ([[A0]]), %rax
-; CHECK: cmpq ([[A1]]), %rax
+; CHECK: movq
+; CHECK: cmpq
}
define void @memcmp8a(i8* %X, i32* nocapture %P) nounwind {
@@ -108,7 +110,7 @@ bb: ; preds = %entry
return: ; preds = %entry
ret void
; CHECK-LABEL: memcmp8a:
-; CHECK: movabsq $8029759185026510694, %rax
-; CHECK: cmpq %rax, ([[A0]])
+; CHECK: movabsq $8029759185026510694,
+; CHECK: cmpq
}
diff --git a/test/CodeGen/X86/memset-2.ll b/test/CodeGen/X86/memset-2.ll
index d0a3c7a..a87ef2e 100644
--- a/test/CodeGen/X86/memset-2.ll
+++ b/test/CodeGen/X86/memset-2.ll
@@ -5,7 +5,7 @@ declare void @llvm.memset.i32(i8*, i8, i32, i32) nounwind
define fastcc void @t1() nounwind {
entry:
; CHECK-LABEL: t1:
-; CHECK: calll _memset
+; CHECK: calll L_memset$stub
call void @llvm.memset.p0i8.i32(i8* null, i8 0, i32 188, i32 1, i1 false)
unreachable
}
@@ -13,7 +13,7 @@ entry:
define fastcc void @t2(i8 signext %c) nounwind {
entry:
; CHECK-LABEL: t2:
-; CHECK: calll _memset
+; CHECK: calll L_memset$stub
call void @llvm.memset.p0i8.i32(i8* undef, i8 %c, i32 76, i32 1, i1 false)
unreachable
}
diff --git a/test/CodeGen/X86/misched-aa-colored.ll b/test/CodeGen/X86/misched-aa-colored.ll
new file mode 100644
index 0000000..52a5e5d
--- /dev/null
+++ b/test/CodeGen/X86/misched-aa-colored.ll
@@ -0,0 +1,189 @@
+; RUN: llc < %s -mcpu=x86-64 -enable-misched -misched-bottomup=0 -misched-topdown=0 -misched=shuffle -enable-aa-sched-mi | FileCheck %s
+; REQUIRES: asserts
+; -misched=shuffle is only available in asserts builds (hence REQUIRES: asserts).
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%"class.llvm::SDNode.10.610.970.1930.2050.2290.4090" = type { %"class.llvm::FoldingSetImpl::Node.0.600.960.1920.2040.2280.4080", %"class.llvm::ilist_node.2.602.962.1922.2042.2282.4082", i16, [2 x i8], i32, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"*, %"struct.llvm::EVT.8.608.968.1928.2048.2288.4088"*, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"*, i16, i16, %"class.llvm::DebugLoc.9.609.969.1929.2049.2289.4089", i32 }
+%"class.llvm::FoldingSetImpl::Node.0.600.960.1920.2040.2280.4080" = type { i8* }
+%"class.llvm::ilist_node.2.602.962.1922.2042.2282.4082" = type { %"class.llvm::ilist_half_node.1.601.961.1921.2041.2281.4081", %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* }
+%"class.llvm::ilist_half_node.1.601.961.1921.2041.2281.4081" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* }
+%"struct.llvm::EVT.8.608.968.1928.2048.2288.4088" = type { %"class.llvm::MVT.5.605.965.1925.2045.2285.4085", %"class.llvm::Type.7.607.967.1927.2047.2287.4087"* }
+%"class.llvm::MVT.5.605.965.1925.2045.2285.4085" = type { i32 }
+%"class.llvm::SDUse.4.604.964.1924.2044.2284.4084" = type { %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"**, %"class.llvm::SDUse.4.604.964.1924.2044.2284.4084"* }
+%"class.llvm::SDValue.3.603.963.1923.2043.2283.4083" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 }
+%"class.llvm::DebugLoc.9.609.969.1929.2049.2289.4089" = type { i32, i32 }
+%"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184" = type { %"class.llvm::TargetMachine.17.617.977.1937.2057.2297.4097"*, %"class.llvm::TargetSelectionDAGInfo.18.618.978.1938.2058.2298.4098"*, %"class.llvm::TargetTransformInfo.19.619.979.1939.2059.2299.4099"*, %"class.llvm::TargetLowering.51.651.1011.1971.2091.2331.4131"*, %"class.llvm::MachineFunction.52.652.1012.1972.2092.2332.4132"*, %"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086"*, i32, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090", %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", %"struct.llvm::ilist.55.655.1015.1975.2095.2335.4135", %"class.llvm::RecyclingAllocator.65.665.1025.1985.2105.2345.4145", %"class.llvm::FoldingSet.67.667.1027.1987.2107.2347.4147", %"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144", %"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144", %"class.llvm::SDDbgInfo.79.679.1039.1999.2119.2359.4159"*, i8, %"struct.llvm::SelectionDAG::DAGUpdateListener.80.680.1040.2000.2120.2360.4160"*, %"class.std::map.43.84.684.1044.2004.2124.2364.4164", %"class.llvm::FoldingSet.50.85.685.1045.2005.2125.2365.4165", %"class.std::vector.51.89.689.1049.2009.2129.2369.4169", %"class.std::vector.56.92.692.1052.2012.2132.2372.4172", %"class.std::map.61.96.696.1056.2016.2136.2376.4176", %"class.llvm::StringMap.99.699.1059.2019.2139.2379.4179", %"class.std::map.66.103.703.1063.2023.2143.2383.4183" }
+%"class.llvm::TargetMachine.17.617.977.1937.2057.2297.4097" = type { i32 (...)**, %"class.llvm::Target.11.611.971.1931.2051.2291.4091"*, %"class.std::basic_string.13.613.973.1933.2053.2293.4093", %"class.std::basic_string.13.613.973.1933.2053.2293.4093", %"class.std::basic_string.13.613.973.1933.2053.2293.4093", %"class.llvm::MCCodeGenInfo.14.614.974.1934.2054.2294.4094"*, %"class.llvm::MCAsmInfo.15.615.975.1935.2055.2295.4095"*, i8, %"class.llvm::TargetOptions.16.616.976.1936.2056.2296.4096" }
+%"class.llvm::Target.11.611.971.1931.2051.2291.4091" = type opaque
+%"class.std::basic_string.13.613.973.1933.2053.2293.4093" = type { %"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.12.612.972.1932.2052.2292.4092" }
+%"struct.std::basic_string<char, std::char_traits<char>, std::allocator<char> >::_Alloc_hider.12.612.972.1932.2052.2292.4092" = type { i8* }
+%"class.llvm::MCCodeGenInfo.14.614.974.1934.2054.2294.4094" = type opaque
+%"class.llvm::MCAsmInfo.15.615.975.1935.2055.2295.4095" = type opaque
+%"class.llvm::TargetOptions.16.616.976.1936.2056.2296.4096" = type { [2 x i8], i32, i8, %"class.std::basic_string.13.613.973.1933.2053.2293.4093", i32, i32 }
+%"class.llvm::TargetSelectionDAGInfo.18.618.978.1938.2058.2298.4098" = type opaque
+%"class.llvm::TargetTransformInfo.19.619.979.1939.2059.2299.4099" = type opaque
+%"class.llvm::TargetLowering.51.651.1011.1971.2091.2331.4131" = type { %"class.llvm::TargetLoweringBase.50.650.1010.1970.2090.2330.4130" }
+%"class.llvm::TargetLoweringBase.50.650.1010.1970.2090.2330.4130" = type { i32 (...)**, %"class.llvm::TargetMachine.17.617.977.1937.2057.2297.4097"*, %"class.llvm::DataLayout.35.635.995.1955.2075.2315.4115"*, %"class.llvm::TargetLoweringObjectFile.36.636.996.1956.2076.2316.4116"*, i8, i8, i8, i8, %"class.llvm::DenseMap.11.38.638.998.1958.2078.2318.4118", i8, i8, i8, i8, i8, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i8, i32, i32, i32, [58 x %"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119"*], [58 x i8], [58 x %"class.llvm::MVT.5.605.965.1925.2045.2285.4085"], [58 x %"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119"*], [58 x i8], [58 x %"class.llvm::MVT.5.605.965.1925.2045.2285.4085"], [58 x [188 x i8]], [58 x [4 x i8]], [58 x [58 x i8]], [58 x [5 x i8]], [24 x [4 x i32]], %"class.llvm::TargetLoweringBase::ValueTypeActionImpl.40.640.1000.1960.2080.2320.4120", %"class.std::vector.15.44.644.1004.1964.2084.2324.4124", [24 x i8], %"class.std::map.49.649.1009.1969.2089.2329.4129", [341 x i8*], [341 x i32], [341 x i32], i32, i32, i32, i32, i32, i32, i8 }
+%"class.llvm::DataLayout.35.635.995.1955.2075.2315.4115" = type { [28 x i8], i8, i32, i32, %"class.llvm::SmallVector.23.623.983.1943.2063.2303.4103", %"class.llvm::SmallVector.3.31.631.991.1951.2071.2311.4111", %"class.llvm::DenseMap.34.634.994.1954.2074.2314.4114", i8* }
+%"class.llvm::SmallVector.23.623.983.1943.2063.2303.4103" = type { [25 x i8], %"struct.llvm::SmallVectorStorage.22.622.982.1942.2062.2302.4102" }
+%"struct.llvm::SmallVectorStorage.22.622.982.1942.2062.2302.4102" = type { [7 x %"struct.llvm::AlignedCharArrayUnion.21.621.981.1941.2061.2301.4101"] }
+%"struct.llvm::AlignedCharArrayUnion.21.621.981.1941.2061.2301.4101" = type { %"struct.llvm::AlignedCharArray.20.620.980.1940.2060.2300.4100" }
+%"struct.llvm::AlignedCharArray.20.620.980.1940.2060.2300.4100" = type { [1 x i8] }
+%"class.llvm::SmallVector.3.31.631.991.1951.2071.2311.4111" = type { %"class.llvm::SmallVectorImpl.4.29.629.989.1949.2069.2309.4109", %"struct.llvm::SmallVectorStorage.9.30.630.990.1950.2070.2310.4110" }
+%"class.llvm::SmallVectorImpl.4.29.629.989.1949.2069.2309.4109" = type { %"class.llvm::SmallVectorTemplateBase.5.28.628.988.1948.2068.2308.4108" }
+%"class.llvm::SmallVectorTemplateBase.5.28.628.988.1948.2068.2308.4108" = type { %"class.llvm::SmallVectorTemplateCommon.6.27.627.987.1947.2067.2307.4107" }
+%"class.llvm::SmallVectorTemplateCommon.6.27.627.987.1947.2067.2307.4107" = type { %"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104", %"struct.llvm::AlignedCharArrayUnion.7.26.626.986.1946.2066.2306.4106" }
+%"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104" = type { i8*, i8*, i8* }
+%"struct.llvm::AlignedCharArrayUnion.7.26.626.986.1946.2066.2306.4106" = type { %"struct.llvm::AlignedCharArray.8.25.625.985.1945.2065.2305.4105" }
+%"struct.llvm::AlignedCharArray.8.25.625.985.1945.2065.2305.4105" = type { [8 x i8] }
+%"struct.llvm::SmallVectorStorage.9.30.630.990.1950.2070.2310.4110" = type { [15 x %"struct.llvm::AlignedCharArrayUnion.7.26.626.986.1946.2066.2306.4106"] }
+%"class.llvm::DenseMap.34.634.994.1954.2074.2314.4114" = type { %"struct.std::pair.10.33.633.993.1953.2073.2313.4113"*, i32, i32, i32 }
+%"struct.std::pair.10.33.633.993.1953.2073.2313.4113" = type { i32, %"struct.llvm::PointerAlignElem.32.632.992.1952.2072.2312.4112" }
+%"struct.llvm::PointerAlignElem.32.632.992.1952.2072.2312.4112" = type { i32, i32, i32, i32 }
+%"class.llvm::TargetLoweringObjectFile.36.636.996.1956.2076.2316.4116" = type opaque
+%"class.llvm::DenseMap.11.38.638.998.1958.2078.2318.4118" = type { %"struct.std::pair.14.37.637.997.1957.2077.2317.4117"*, i32, i32, i32 }
+%"struct.std::pair.14.37.637.997.1957.2077.2317.4117" = type { i32, i32 }
+%"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119" = type opaque
+%"class.llvm::TargetLoweringBase::ValueTypeActionImpl.40.640.1000.1960.2080.2320.4120" = type { [58 x i8] }
+%"class.std::vector.15.44.644.1004.1964.2084.2324.4124" = type { %"struct.std::_Vector_base.16.43.643.1003.1963.2083.2323.4123" }
+%"struct.std::_Vector_base.16.43.643.1003.1963.2083.2323.4123" = type { %"struct.std::_Vector_base<std::pair<llvm::MVT, const llvm::TargetRegisterClass *>, std::allocator<std::pair<llvm::MVT, const llvm::TargetRegisterClass *> > >::_Vector_impl.42.642.1002.1962.2082.2322.4122" }
+%"struct.std::_Vector_base<std::pair<llvm::MVT, const llvm::TargetRegisterClass *>, std::allocator<std::pair<llvm::MVT, const llvm::TargetRegisterClass *> > >::_Vector_impl.42.642.1002.1962.2082.2322.4122" = type { %"struct.std::pair.20.41.641.1001.1961.2081.2321.4121"*, %"struct.std::pair.20.41.641.1001.1961.2081.2321.4121"*, %"struct.std::pair.20.41.641.1001.1961.2081.2321.4121"* }
+%"struct.std::pair.20.41.641.1001.1961.2081.2321.4121" = type { %"class.llvm::MVT.5.605.965.1925.2045.2285.4085", %"class.llvm::TargetRegisterClass.39.639.999.1959.2079.2319.4119"* }
+%"class.std::map.49.649.1009.1969.2089.2329.4129" = type { %"class.std::_Rb_tree.48.648.1008.1968.2088.2328.4128" }
+%"class.std::_Rb_tree.48.648.1008.1968.2088.2328.4128" = type { %"struct.std::_Rb_tree<std::pair<unsigned int, llvm::MVT::SimpleValueType>, std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType>, std::_Select1st<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> >, std::less<std::pair<unsigned int, llvm::MVT::SimpleValueType> >, std::allocator<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> > >::_Rb_tree_impl.47.647.1007.1967.2087.2327.4127" }
+%"struct.std::_Rb_tree<std::pair<unsigned int, llvm::MVT::SimpleValueType>, std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType>, std::_Select1st<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> >, std::less<std::pair<unsigned int, llvm::MVT::SimpleValueType> >, std::allocator<std::pair<const std::pair<unsigned int, llvm::MVT::SimpleValueType>, llvm::MVT::SimpleValueType> > >::_Rb_tree_impl.47.647.1007.1967.2087.2327.4127" = type { %"struct.std::less.45.645.1005.1965.2085.2325.4125", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.std::less.45.645.1005.1965.2085.2325.4125" = type { i8 }
+%"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126" = type { i32, %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126"*, %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126"*, %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126"* }
+%"class.llvm::MachineFunction.52.652.1012.1972.2092.2332.4132" = type opaque
+%"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086" = type opaque
+%"struct.llvm::ilist.55.655.1015.1975.2095.2335.4135" = type { %"class.llvm::iplist.54.654.1014.1974.2094.2334.4134" }
+%"class.llvm::iplist.54.654.1014.1974.2094.2334.4134" = type { %"struct.llvm::ilist_traits.53.653.1013.1973.2093.2333.4133", %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* }
+%"struct.llvm::ilist_traits.53.653.1013.1973.2093.2333.4133" = type { %"class.llvm::ilist_half_node.1.601.961.1921.2041.2281.4081" }
+%"class.llvm::RecyclingAllocator.65.665.1025.1985.2105.2345.4145" = type { %"class.llvm::Recycler.59.659.1019.1979.2099.2339.4139", %"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144" }
+%"class.llvm::Recycler.59.659.1019.1979.2099.2339.4139" = type { %"class.llvm::iplist.24.58.658.1018.1978.2098.2338.4138" }
+%"class.llvm::iplist.24.58.658.1018.1978.2098.2338.4138" = type { %"struct.llvm::ilist_traits.25.57.657.1017.1977.2097.2337.4137", %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136"* }
+%"struct.llvm::ilist_traits.25.57.657.1017.1977.2097.2337.4137" = type { %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136" }
+%"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136" = type { %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136"*, %"struct.llvm::RecyclerStruct.56.656.1016.1976.2096.2336.4136"* }
+%"class.llvm::FoldingSet.67.667.1027.1987.2107.2347.4147" = type { %"class.llvm::FoldingSetImpl.66.666.1026.1986.2106.2346.4146" }
+%"class.llvm::FoldingSetImpl.66.666.1026.1986.2106.2346.4146" = type { i32 (...)**, i8**, i32, i32 }
+%"class.llvm::BumpPtrAllocator.64.664.1024.1984.2104.2344.4144" = type { i64, i64, %"class.llvm::MallocSlabAllocator.62.662.1022.1982.2102.2342.4142", %"class.llvm::SlabAllocator.60.660.1020.1980.2100.2340.4140"*, %"class.llvm::MemSlab.63.663.1023.1983.2103.2343.4143"*, i8*, i8*, i64 }
+%"class.llvm::MallocSlabAllocator.62.662.1022.1982.2102.2342.4142" = type { %"class.llvm::SlabAllocator.60.660.1020.1980.2100.2340.4140", %"class.llvm::MallocAllocator.61.661.1021.1981.2101.2341.4141" }
+%"class.llvm::SlabAllocator.60.660.1020.1980.2100.2340.4140" = type { i32 (...)** }
+%"class.llvm::MallocAllocator.61.661.1021.1981.2101.2341.4141" = type { i8 }
+%"class.llvm::MemSlab.63.663.1023.1983.2103.2343.4143" = type { i64, %"class.llvm::MemSlab.63.663.1023.1983.2103.2343.4143"* }
+%"class.llvm::SDDbgInfo.79.679.1039.1999.2119.2359.4159" = type { %"class.llvm::SmallVector.30.74.674.1034.1994.2114.2354.4154", %"class.llvm::SmallVector.30.74.674.1034.1994.2114.2354.4154", %"class.llvm::DenseMap.37.78.678.1038.1998.2118.2358.4158" }
+%"class.llvm::SmallVector.30.74.674.1034.1994.2114.2354.4154" = type { %"class.llvm::SmallVectorImpl.31.72.672.1032.1992.2112.2352.4152", %"struct.llvm::SmallVectorStorage.36.73.673.1033.1993.2113.2353.4153" }
+%"class.llvm::SmallVectorImpl.31.72.672.1032.1992.2112.2352.4152" = type { %"class.llvm::SmallVectorTemplateBase.32.71.671.1031.1991.2111.2351.4151" }
+%"class.llvm::SmallVectorTemplateBase.32.71.671.1031.1991.2111.2351.4151" = type { %"class.llvm::SmallVectorTemplateCommon.33.70.670.1030.1990.2110.2350.4150" }
+%"class.llvm::SmallVectorTemplateCommon.33.70.670.1030.1990.2110.2350.4150" = type { %"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104", %"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149" }
+%"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149" = type { %"struct.llvm::AlignedCharArray.35.68.668.1028.1988.2108.2348.4148" }
+%"struct.llvm::AlignedCharArray.35.68.668.1028.1988.2108.2348.4148" = type { [8 x i8] }
+%"struct.llvm::SmallVectorStorage.36.73.673.1033.1993.2113.2353.4153" = type { [31 x %"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149"] }
+%"class.llvm::DenseMap.37.78.678.1038.1998.2118.2358.4158" = type { %"struct.std::pair.40.77.677.1037.1997.2117.2357.4157"*, i32, i32, i32 }
+%"struct.std::pair.40.77.677.1037.1997.2117.2357.4157" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, %"class.llvm::SmallVector.41.76.676.1036.1996.2116.2356.4156" }
+%"class.llvm::SmallVector.41.76.676.1036.1996.2116.2356.4156" = type { %"class.llvm::SmallVectorImpl.31.72.672.1032.1992.2112.2352.4152", %"struct.llvm::SmallVectorStorage.42.75.675.1035.1995.2115.2355.4155" }
+%"struct.llvm::SmallVectorStorage.42.75.675.1035.1995.2115.2355.4155" = type { [1 x %"struct.llvm::AlignedCharArrayUnion.34.69.669.1029.1989.2109.2349.4149"] }
+%"struct.llvm::SelectionDAG::DAGUpdateListener.80.680.1040.2000.2120.2360.4160" = type { i32 (...)**, %"struct.llvm::SelectionDAG::DAGUpdateListener.80.680.1040.2000.2120.2360.4160"*, %"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"* }
+%"class.std::map.43.84.684.1044.2004.2124.2364.4164" = type { %"class.std::_Rb_tree.44.83.683.1043.2003.2123.2363.4163" }
+%"class.std::_Rb_tree.44.83.683.1043.2003.2123.2363.4163" = type { %"struct.std::_Rb_tree<const llvm::SDNode *, std::pair<const llvm::SDNode *const, std::basic_string<char> >, std::_Select1st<std::pair<const llvm::SDNode *const, std::basic_string<char> > >, std::less<const llvm::SDNode *>, std::allocator<std::pair<const llvm::SDNode *const, std::basic_string<char> > > >::_Rb_tree_impl.82.682.1042.2002.2122.2362.4162" }
+%"struct.std::_Rb_tree<const llvm::SDNode *, std::pair<const llvm::SDNode *const, std::basic_string<char> >, std::_Select1st<std::pair<const llvm::SDNode *const, std::basic_string<char> > >, std::less<const llvm::SDNode *>, std::allocator<std::pair<const llvm::SDNode *const, std::basic_string<char> > > >::_Rb_tree_impl.82.682.1042.2002.2122.2362.4162" = type { %"struct.std::less.48.81.681.1041.2001.2121.2361.4161", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.std::less.48.81.681.1041.2001.2121.2361.4161" = type { i8 }
+%"class.llvm::FoldingSet.50.85.685.1045.2005.2125.2365.4165" = type { %"class.llvm::FoldingSetImpl.66.666.1026.1986.2106.2346.4146" }
+%"class.std::vector.51.89.689.1049.2009.2129.2369.4169" = type { %"struct.std::_Vector_base.52.88.688.1048.2008.2128.2368.4168" }
+%"struct.std::_Vector_base.52.88.688.1048.2008.2128.2368.4168" = type { %"struct.std::_Vector_base<llvm::CondCodeSDNode *, std::allocator<llvm::CondCodeSDNode *> >::_Vector_impl.87.687.1047.2007.2127.2367.4167" }
+%"struct.std::_Vector_base<llvm::CondCodeSDNode *, std::allocator<llvm::CondCodeSDNode *> >::_Vector_impl.87.687.1047.2007.2127.2367.4167" = type { %"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166"**, %"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166"**, %"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166"** }
+%"class.llvm::CondCodeSDNode.86.686.1046.2006.2126.2366.4166" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090", i32 }
+%"class.std::vector.56.92.692.1052.2012.2132.2372.4172" = type { %"struct.std::_Vector_base.57.91.691.1051.2011.2131.2371.4171" }
+%"struct.std::_Vector_base.57.91.691.1051.2011.2131.2371.4171" = type { %"struct.std::_Vector_base<llvm::SDNode *, std::allocator<llvm::SDNode *> >::_Vector_impl.90.690.1050.2010.2130.2370.4170" }
+%"struct.std::_Vector_base<llvm::SDNode *, std::allocator<llvm::SDNode *> >::_Vector_impl.90.690.1050.2010.2130.2370.4170" = type { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"**, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"**, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"** }
+%"class.std::map.61.96.696.1056.2016.2136.2376.4176" = type { %"class.std::_Rb_tree.62.95.695.1055.2015.2135.2375.4175" }
+%"class.std::_Rb_tree.62.95.695.1055.2015.2135.2375.4175" = type { %"struct.std::_Rb_tree<llvm::EVT, std::pair<const llvm::EVT, llvm::SDNode *>, std::_Select1st<std::pair<const llvm::EVT, llvm::SDNode *> >, llvm::EVT::compareRawBits, std::allocator<std::pair<const llvm::EVT, llvm::SDNode *> > >::_Rb_tree_impl.94.694.1054.2014.2134.2374.4174" }
+%"struct.std::_Rb_tree<llvm::EVT, std::pair<const llvm::EVT, llvm::SDNode *>, std::_Select1st<std::pair<const llvm::EVT, llvm::SDNode *> >, llvm::EVT::compareRawBits, std::allocator<std::pair<const llvm::EVT, llvm::SDNode *> > >::_Rb_tree_impl.94.694.1054.2014.2134.2374.4174" = type { %"struct.llvm::EVT::compareRawBits.93.693.1053.2013.2133.2373.4173", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.llvm::EVT::compareRawBits.93.693.1053.2013.2133.2373.4173" = type { i8 }
+%"class.llvm::StringMap.99.699.1059.2019.2139.2379.4179" = type { %"class.llvm::StringMapImpl.98.698.1058.2018.2138.2378.4178", %"class.llvm::MallocAllocator.61.661.1021.1981.2101.2341.4141" }
+%"class.llvm::StringMapImpl.98.698.1058.2018.2138.2378.4178" = type { %"class.llvm::StringMapEntryBase.97.697.1057.2017.2137.2377.4177"**, i32, i32, i32, i32 }
+%"class.llvm::StringMapEntryBase.97.697.1057.2017.2137.2377.4177" = type { i32 }
+%"class.std::map.66.103.703.1063.2023.2143.2383.4183" = type { %"class.std::_Rb_tree.67.102.702.1062.2022.2142.2382.4182" }
+%"class.std::_Rb_tree.67.102.702.1062.2022.2142.2382.4182" = type { %"struct.std::_Rb_tree<std::pair<std::basic_string<char>, unsigned char>, std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *>, std::_Select1st<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> >, std::less<std::pair<std::basic_string<char>, unsigned char> >, std::allocator<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> > >::_Rb_tree_impl.101.701.1061.2021.2141.2381.4181" }
+%"struct.std::_Rb_tree<std::pair<std::basic_string<char>, unsigned char>, std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *>, std::_Select1st<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> >, std::less<std::pair<std::basic_string<char>, unsigned char> >, std::allocator<std::pair<const std::pair<std::basic_string<char>, unsigned char>, llvm::SDNode *> > >::_Rb_tree_impl.101.701.1061.2021.2141.2381.4181" = type { %"struct.std::less.71.100.700.1060.2020.2140.2380.4180", %"struct.std::_Rb_tree_node_base.46.646.1006.1966.2086.2326.4126", i64 }
+%"struct.std::less.71.100.700.1060.2020.2140.2380.4180" = type { i8 }
+%"class.llvm::Type.7.607.967.1927.2047.2287.4087" = type { %"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086"*, i32, i32, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"** }
+%"class.llvm::DAGTypeLegalizer.117.717.1077.2037.2157.2397.4197" = type { %"class.llvm::TargetLowering.51.651.1011.1971.2091.2331.4131"*, %"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"*, %"class.llvm::TargetLoweringBase::ValueTypeActionImpl.40.640.1000.1960.2080.2320.4120", [6 x i8], %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187", %"class.llvm::SmallVector.82.116.716.1076.2036.2156.2396.4196" }
+%"class.llvm::SmallDenseMap.77.110.710.1070.2030.2150.2390.4190" = type { [4 x i8], i32, %"struct.llvm::AlignedCharArrayUnion.80.109.709.1069.2029.2149.2389.4189" }
+%"struct.llvm::AlignedCharArrayUnion.80.109.709.1069.2029.2149.2389.4189" = type { %"struct.llvm::AlignedCharArray.81.108.708.1068.2028.2148.2388.4188" }
+%"struct.llvm::AlignedCharArray.81.108.708.1068.2028.2148.2388.4188" = type { [384 x i8] }
+%"class.llvm::SmallDenseMap.107.707.1067.2027.2147.2387.4187" = type { [4 x i8], i32, %"struct.llvm::AlignedCharArrayUnion.75.106.706.1066.2026.2146.2386.4186" }
+%"struct.llvm::AlignedCharArrayUnion.75.106.706.1066.2026.2146.2386.4186" = type { %"struct.llvm::AlignedCharArray.76.105.705.1065.2025.2145.2385.4185" }
+%"struct.llvm::AlignedCharArray.76.105.705.1065.2025.2145.2385.4185" = type { [256 x i8] }
+%"class.llvm::SmallVector.82.116.716.1076.2036.2156.2396.4196" = type { %"class.llvm::SmallVectorImpl.83.114.714.1074.2034.2154.2394.4194", %"struct.llvm::SmallVectorStorage.87.115.715.1075.2035.2155.2395.4195" }
+%"class.llvm::SmallVectorImpl.83.114.714.1074.2034.2154.2394.4194" = type { %"class.llvm::SmallVectorTemplateBase.84.113.713.1073.2033.2153.2393.4193" }
+%"class.llvm::SmallVectorTemplateBase.84.113.713.1073.2033.2153.2393.4193" = type { %"class.llvm::SmallVectorTemplateCommon.85.112.712.1072.2032.2152.2392.4192" }
+%"class.llvm::SmallVectorTemplateCommon.85.112.712.1072.2032.2152.2392.4192" = type { %"class.llvm::SmallVectorBase.24.624.984.1944.2064.2304.4104", %"struct.llvm::AlignedCharArrayUnion.86.111.711.1071.2031.2151.2391.4191" }
+%"struct.llvm::AlignedCharArrayUnion.86.111.711.1071.2031.2151.2391.4191" = type { %"struct.llvm::AlignedCharArray.35.68.668.1028.1988.2108.2348.4148" }
+%"struct.llvm::SmallVectorStorage.87.115.715.1075.2035.2155.2395.4195" = type { [127 x %"struct.llvm::AlignedCharArrayUnion.86.111.711.1071.2031.2151.2391.4191"] }
+%"struct.std::pair.112.119.719.1079.2039.2159.2399.4199" = type { i32, %"struct.llvm::EVT.8.608.968.1928.2048.2288.4088" }
+%"class.llvm::DenseMapBase.73.118.718.1078.2038.2158.2398.4198" = type { i8 }
+
+@.str61 = external hidden unnamed_addr constant [80 x i8], align 1
+@.str63 = external hidden unnamed_addr constant [80 x i8], align 1
+@.str74 = external hidden unnamed_addr constant [49 x i8], align 1
+@__PRETTY_FUNCTION__._ZN4llvm16DAGTypeLegalizer16GetWidenedVectorENS_7SDValueE = external hidden unnamed_addr constant [70 x i8], align 1
+@.str98 = external hidden unnamed_addr constant [46 x i8], align 1
+@__PRETTY_FUNCTION__._ZNK4llvm6SDNode12getValueTypeEj = external hidden unnamed_addr constant [57 x i8], align 1
+@.str99 = external hidden unnamed_addr constant [19 x i8], align 1
+@__PRETTY_FUNCTION__._ZN4llvm5SDLocC2EPKNS_6SDNodeE = external hidden unnamed_addr constant [41 x i8], align 1
+@.str100 = external hidden unnamed_addr constant [50 x i8], align 1
+@__PRETTY_FUNCTION__._ZNK4llvm6SDNode10getOperandEj = external hidden unnamed_addr constant [66 x i8], align 1
+
+declare { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_(%"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"*, i32, i8*, i32, i32, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"*, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8)
+
+; Function Attrs: noreturn nounwind
+declare void @__assert_fail(i8*, i8*, i32, i8*) #0
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.start(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind
+declare void @llvm.lifetime.end(i64, i8* nocapture) #1
+
+; Function Attrs: nounwind uwtable
+define hidden { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm16DAGTypeLegalizer18WidenVecRes_BinaryEPNS_6SDNodeE(%"class.llvm::DAGTypeLegalizer.117.717.1077.2037.2157.2397.4197"* %this, %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"* %N) #2 align 2 {
+entry:
+ %Op.i43 = alloca %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", align 8
+ %ref.tmp.i = alloca %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199", align 8
+ %Op.i = alloca %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083", align 8
+ %0 = bitcast %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i to i8*
+ %retval.sroa.0.0.idx.i36 = getelementptr inbounds %"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* %ref.tmp.i, i64 0, i32 1, i32 0, i32 0
+ %retval.sroa.0.0.copyload.i37 = load i32* %retval.sroa.0.0.idx.i36, align 8
+ call void @llvm.lifetime.end(i64 24, i8* %0) #1
+ %agg.tmp8.sroa.2.0.copyload = load i32* undef, align 8
+ %1 = bitcast %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* %Op.i to i8*
+ call void @llvm.lifetime.start(i64 16, i8* %1) #1
+ %2 = getelementptr %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* %Op.i, i64 0, i32 1
+ store i32 %agg.tmp8.sroa.2.0.copyload, i32* %2, align 8
+
+; CHECK: movl (%rax), %eax
+; CHECK-NOT: movl %eax, {{[0-9]+}}(%rsp)
+; CHECK: movl [[OFF:[0-9]+]](%rsp), %r8d
+; CHECK: movl %eax, [[OFF]](%rsp)
+; CHECK: movl $-1, %ecx
+; CHECK: callq _ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_
+
+ %call18 = call { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } @_ZN4llvm12SelectionDAG7getNodeEjNS_5SDLocENS_3EVTENS_7SDValueES3_(%"class.llvm::SelectionDAG.104.704.1064.2024.2144.2384.4184"* undef, i32 undef, i8* undef, i32 -1, i32 %retval.sroa.0.0.copyload.i37, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"* undef, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8 undef, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* byval align 8 undef) #1
+ ret { %"class.llvm::SDNode.10.610.970.1930.2050.2290.4090"*, i32 } %call18
+}
+
+; Function Attrs: nounwind uwtable
+declare hidden %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* @_ZN4llvm12DenseMapBaseINS_13SmallDenseMapINS_7SDValueES2_Lj8ENS_12DenseMapInfoIS2_EEEES2_S2_S4_EixERKS2_(%"class.llvm::DenseMapBase.73.118.718.1078.2038.2158.2398.4198"*, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"* nocapture readonly) #2 align 2
+
+declare hidden void @_ZN4llvm16DAGTypeLegalizer10RemapValueERNS_7SDValueE(%"class.llvm::DAGTypeLegalizer.117.717.1077.2037.2157.2397.4197"*, %"class.llvm::SDValue.3.603.963.1923.2043.2283.4083"*)
+
+; Function Attrs: nounwind uwtable
+declare hidden void @_ZNK4llvm18TargetLoweringBase17getTypeConversionERNS_11LLVMContextENS_3EVTE(%"struct.std::pair.112.119.719.1079.2039.2159.2399.4199"* noalias sret, %"class.llvm::TargetLoweringBase.50.650.1010.1970.2090.2330.4130"* readonly, %"class.llvm::LLVMContext.6.606.966.1926.2046.2286.4086"*, i32, %"class.llvm::Type.7.607.967.1927.2047.2287.4087"*) #2 align 2
+
+attributes #0 = { noreturn nounwind }
+attributes #1 = { nounwind }
+attributes #2 = { nounwind uwtable }
+
diff --git a/test/CodeGen/X86/misched-aa-mmos.ll b/test/CodeGen/X86/misched-aa-mmos.ll
new file mode 100644
index 0000000..343e26f
--- /dev/null
+++ b/test/CodeGen/X86/misched-aa-mmos.ll
@@ -0,0 +1,37 @@
+; RUN: llc -enable-misched -enable-aa-sched-mi < %s
+
+; This generates a decw instruction, which has two MMOs, and an alias SU edge
+; query involving that instruction. Make sure this does not crash.
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%s1 = type { i16, i16, i32 }
+%c1 = type { %s1*, %u1, i16, i8 }
+%u1 = type { i64 }
+
+declare zeroext i1 @bar(i64*, i32) #5
+
+define i32 @foo() #0 align 2 {
+entry:
+ %temp_rhs = alloca %c1, align 8
+ br i1 undef, label %if.else56, label %cond.end.i
+
+cond.end.i:
+ %significand.i18.i = getelementptr inbounds %c1* %temp_rhs, i64 0, i32 1
+ %exponent.i = getelementptr inbounds %c1* %temp_rhs, i64 0, i32 2
+ %0 = load i16* %exponent.i, align 8
+ %sub.i = add i16 %0, -1
+ store i16 %sub.i, i16* %exponent.i, align 8
+ %parts.i.i = bitcast %u1* %significand.i18.i to i64**
+ %1 = load i64** %parts.i.i, align 8
+ %call5.i = call zeroext i1 @bar(i64* %1, i32 undef) #1
+ unreachable
+
+if.else56:
+ unreachable
+}
+
+attributes #0 = { nounwind uwtable }
+attributes #1 = { nounwind }
+
diff --git a/test/CodeGen/X86/misched-matmul.ll b/test/CodeGen/X86/misched-matmul.ll
index 5454b7c..3ea6512 100644
--- a/test/CodeGen/X86/misched-matmul.ll
+++ b/test/CodeGen/X86/misched-matmul.ll
@@ -10,7 +10,7 @@
; more complex cases.
;
; CHECK: @wrap_mul4
-; CHECK: 23 regalloc - Number of spills inserted
+; CHECK: 22 regalloc - Number of spills inserted
define void @wrap_mul4(double* nocapture %Out, [4 x double]* nocapture %A, [4 x double]* nocapture %B) #0 {
entry:
diff --git a/test/CodeGen/X86/movbe.ll b/test/CodeGen/X86/movbe.ll
index 3f459be..e248410 100644
--- a/test/CodeGen/X86/movbe.ll
+++ b/test/CodeGen/X86/movbe.ll
@@ -1,45 +1,66 @@
; RUN: llc -mtriple=x86_64-linux -mcpu=atom < %s | FileCheck %s
; RUN: llc -mtriple=x86_64-linux -mcpu=slm < %s | FileCheck %s -check-prefix=SLM
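+; Check that i16/i32/i64 stores of byte-swapped values and byte-swapped loads
+; are matched to the movbe instruction on CPUs that support it.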
+declare i16 @llvm.bswap.i16(i16) nounwind readnone
declare i32 @llvm.bswap.i32(i32) nounwind readnone
declare i64 @llvm.bswap.i64(i64) nounwind readnone
-define void @test1(i32* nocapture %x, i32 %y) nounwind {
+define void @test1(i16* nocapture %x, i16 %y) nounwind {
+ %bswap = call i16 @llvm.bswap.i16(i16 %y)
+ store i16 %bswap, i16* %x, align 2
+ ret void
+; CHECK-LABEL: test1:
+; CHECK: movbew %si, (%rdi)
+; SLM-LABEL: test1:
+; SLM: movbew %si, (%rdi)
+}
+
+define i16 @test2(i16* %x) nounwind {
+ %load = load i16* %x, align 2
+ %bswap = call i16 @llvm.bswap.i16(i16 %load)
+ ret i16 %bswap
+; CHECK-LABEL: test2:
+; CHECK: movbew (%rdi), %ax
+; SLM-LABEL: test2:
+; SLM: movbew (%rdi), %ax
+}
+
+define void @test3(i32* nocapture %x, i32 %y) nounwind {
%bswap = call i32 @llvm.bswap.i32(i32 %y)
store i32 %bswap, i32* %x, align 4
ret void
-; CHECK-LABEL: test1:
+; CHECK-LABEL: test3:
; CHECK: movbel %esi, (%rdi)
-; SLM-LABEL: test1:
+; SLM-LABEL: test3:
; SLM: movbel %esi, (%rdi)
}
-define i32 @test2(i32* %x) nounwind {
+define i32 @test4(i32* %x) nounwind {
%load = load i32* %x, align 4
%bswap = call i32 @llvm.bswap.i32(i32 %load)
ret i32 %bswap
-; CHECK-LABEL: test2:
+; CHECK-LABEL: test4:
; CHECK: movbel (%rdi), %eax
-; SLM-LABEL: test2:
+; SLM-LABEL: test4:
; SLM: movbel (%rdi), %eax
}
-define void @test3(i64* %x, i64 %y) nounwind {
+define void @test5(i64* %x, i64 %y) nounwind {
%bswap = call i64 @llvm.bswap.i64(i64 %y)
store i64 %bswap, i64* %x, align 8
ret void
-; CHECK-LABEL: test3:
+; CHECK-LABEL: test5:
; CHECK: movbeq %rsi, (%rdi)
-; SLM-LABEL: test3:
+; SLM-LABEL: test5:
; SLM: movbeq %rsi, (%rdi)
}
-define i64 @test4(i64* %x) nounwind {
+define i64 @test6(i64* %x) nounwind {
%load = load i64* %x, align 8
%bswap = call i64 @llvm.bswap.i64(i64 %load)
ret i64 %bswap
-; CHECK-LABEL: test4:
+; CHECK-LABEL: test6:
; CHECK: movbeq (%rdi), %rax
-; SLM-LABEL: test4:
+; SLM-LABEL: test6:
; SLM: movbeq (%rdi), %rax
}
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
index 5e7ba37..6910515 100644
--- a/test/CodeGen/X86/ms-inline-asm.ll
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -1,11 +1,10 @@
-; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core2 -no-integrated-as | FileCheck %s
define i32 @t1() nounwind {
entry:
%0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0A\09mov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
ret i32 %0
; CHECK: t1
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: mov eax, ecx
@@ -19,7 +18,6 @@ entry:
call void asm sideeffect inteldialect "mov eax, $$1", "~{eax},~{dirflag},~{fpsr},~{flags}"() nounwind
ret void
; CHECK: t2
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: mov eax, 1
@@ -34,7 +32,6 @@ entry:
call void asm sideeffect inteldialect "mov eax, DWORD PTR [$0]", "*m,~{eax},~{dirflag},~{fpsr},~{flags}"(i32* %V.addr) nounwind
ret void
; CHECK: t3
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: mov eax, DWORD PTR {{[[esp]}}
@@ -56,7 +53,6 @@ entry:
%0 = load i32* %b1, align 4
ret i32 %0
; CHECK: t18
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: lea ebx, foo
@@ -76,7 +72,6 @@ entry:
call void asm sideeffect inteldialect "call $0", "r,~{dirflag},~{fpsr},~{flags}"(void ()* @t19_helper) nounwind
ret void
; CHECK-LABEL: t19:
-; CHECK: movl %esp, %ebp
; CHECK: movl ${{_?}}t19_helper, %eax
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
@@ -95,7 +90,6 @@ entry:
%0 = load i32** %res, align 4
ret i32* %0
; CHECK-LABEL: t30:
-; CHECK: movl %esp, %ebp
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
; CHECK: lea edi, dword ptr [{{_?}}results]
@@ -103,8 +97,31 @@ entry:
; CHECK: {{## InlineAsm End|#NO_APP}}
; CHECK: {{## InlineAsm Start|#APP}}
; CHECK: .intel_syntax
-; CHECK: mov dword ptr [esi], edi
+; CHECK: mov dword ptr [esp], edi
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+; CHECK: movl (%esp), %eax
+}
+
+; Stack realignment plus MS inline asm that does *not* adjust the stack is no
+; longer an error.
+
+define i32 @t31() {
+entry:
+ %val = alloca i32, align 64
+ store i32 -1, i32* %val, align 64
+ call void asm sideeffect inteldialect "mov dword ptr $0, esp", "=*m,~{dirflag},~{fpsr},~{flags}"(i32* %val) #1
+ %sp = load i32* %val, align 64
+ ret i32 %sp
+; CHECK-LABEL: t31:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: andl $-64, %esp
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov dword ptr [esp], esp
; CHECK: .att_syntax
; CHECK: {{## InlineAsm End|#NO_APP}}
-; CHECK: movl (%esi), %eax
+; CHECK: movl (%esp), %eax
+; CHECK: ret
}
diff --git a/test/CodeGen/X86/mul128_sext_loop.ll b/test/CodeGen/X86/mul128_sext_loop.ll
new file mode 100644
index 0000000..a516f03
--- /dev/null
+++ b/test/CodeGen/X86/mul128_sext_loop.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -march=x86-64 | FileCheck %s
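+;
+; The i128 multiply of two sign-extended i64 values should be selected as a
+; single imulq (128-bit result in rdx:rax); no other multiply instructions
+; should be emitted.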
+
+define void @test(i64* nocapture %arr, i64 %arrsize, i64 %factor) nounwind uwtable {
+ %1 = icmp sgt i64 %arrsize, 0
+ br i1 %1, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %2 = sext i64 %factor to i128
+ br label %3
+
+; <label>:3 ; preds = %3, %.lr.ph
+; CHECK-NOT: mul
+; CHECK: imulq
+; CHECK-NOT: mul
+ %carry.02 = phi i128 [ 0, %.lr.ph ], [ %10, %3 ]
+ %i.01 = phi i64 [ 0, %.lr.ph ], [ %11, %3 ]
+ %4 = getelementptr inbounds i64* %arr, i64 %i.01
+ %5 = load i64* %4, align 8
+ %6 = sext i64 %5 to i128
+ %7 = mul nsw i128 %6, %2
+ %8 = add nsw i128 %7, %carry.02
+ %.tr = trunc i128 %8 to i64
+ %9 = and i64 %.tr, 9223372036854775807
+ store i64 %9, i64* %4, align 8
+ %10 = ashr i128 %8, 63
+ %11 = add nsw i64 %i.01, 1
+ %exitcond = icmp eq i64 %11, %arrsize
+ br i1 %exitcond, label %._crit_edge, label %3
+
+._crit_edge: ; preds = %3, %0
+ ret void
+}
diff --git a/test/CodeGen/X86/mult-alt-generic-i686.ll b/test/CodeGen/X86/mult-alt-generic-i686.ll
index 7c3499f..54bc3a4 100644
--- a/test/CodeGen/X86/mult-alt-generic-i686.ll
+++ b/test/CodeGen/X86/mult-alt-generic-i686.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86
+; RUN: llc < %s -march=x86 -no-integrated-as
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i686"
diff --git a/test/CodeGen/X86/mult-alt-generic-x86_64.ll b/test/CodeGen/X86/mult-alt-generic-x86_64.ll
index f35bb5e..84a9c81 100644
--- a/test/CodeGen/X86/mult-alt-generic-x86_64.ll
+++ b/test/CodeGen/X86/mult-alt-generic-x86_64.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64
+; RUN: llc < %s -march=x86-64 -no-integrated-as
; ModuleID = 'mult-alt-generic.c'
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64"
diff --git a/test/CodeGen/X86/mult-alt-x86.ll b/test/CodeGen/X86/mult-alt-x86.ll
index 06175da..cb2219a 100644
--- a/test/CodeGen/X86/mult-alt-x86.ll
+++ b/test/CodeGen/X86/mult-alt-x86.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2
+; RUN: llc < %s -march=x86 -mattr=+sse2 -no-integrated-as
; ModuleID = 'mult-alt-x86.c'
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-f80:128:128-v64:64:64-v128:128:128-a0:0:64-f80:32:32-n8:16:32"
target triple = "i686-pc-win32"
diff --git a/test/CodeGen/X86/multiple-loop-post-inc.ll b/test/CodeGen/X86/multiple-loop-post-inc.ll
index 29b9f34..4edc1ff 100644
--- a/test/CodeGen/X86/multiple-loop-post-inc.ll
+++ b/test/CodeGen/X86/multiple-loop-post-inc.ll
@@ -1,4 +1,4 @@
-; RUN: llc -asm-verbose=false -disable-branch-fold -disable-block-placement -disable-tail-duplicate -march=x86-64 -mcpu=nehalem < %s | FileCheck %s
+; RUN: llc -asm-verbose=false -disable-branch-fold -disable-block-placement -disable-tail-duplicate -march=x86-64 -mcpu=nehalem -no-integrated-as < %s | FileCheck %s
; rdar://7236213
;
; The scheduler's 2-address hack has been disabled, so there is
diff --git a/test/CodeGen/X86/negate-add-zero.ll b/test/CodeGen/X86/negate-add-zero.ll
index 92850f2..c961bd0 100644
--- a/test/CodeGen/X86/negate-add-zero.ll
+++ b/test/CodeGen/X86/negate-add-zero.ll
@@ -827,9 +827,7 @@ declare void @_ZN11MatrixTools9transposeI11FixedMatrixIdLi6ELi6ELi0ELi0EEEENT_13
declare void @_ZN21HNodeTranslateRotate311toCartesianEv(%struct.HNodeTranslateRotate3*)
define linkonce void @_ZN21HNodeTranslateRotate36setVelERK9CDSVectorIdLi1EN3CDS12DefaultAllocEE(%struct.HNodeTranslateRotate3* %this, %"struct.CDSVector<double,0,CDS::DefaultAlloc>"* %velv) {
-entry:
- %0 = add i32 0, -1 ; <i32> [#uses=1]
- %1 = getelementptr double* null, i32 %0 ; <double*> [#uses=1]
+ %1 = getelementptr double* null, i32 -1 ; <double*> [#uses=1]
%2 = load double* %1, align 8 ; <double> [#uses=1]
%3 = load double* null, align 8 ; <double> [#uses=2]
%4 = load double* null, align 8 ; <double> [#uses=2]
@@ -890,13 +888,12 @@ entry:
store double %52, double* %55, align 8
%56 = getelementptr %struct.HNodeTranslateRotate3* %this, i32 0, i32 0, i32 10, i32 0, i32 0, i32 2 ; <double*> [#uses=1]
store double %53, double* %56, align 8
- %57 = add i32 0, 4 ; <i32> [#uses=1]
- %58 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 0 ; <%"struct.CDSVector<double,0,CDS::DefaultAlloc>"**> [#uses=1]
- store %"struct.CDSVector<double,0,CDS::DefaultAlloc>"* %velv, %"struct.CDSVector<double,0,CDS::DefaultAlloc>"** %58, align 8
- %59 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 1 ; <i32*> [#uses=1]
- store i32 %57, i32* %59, align 4
- %60 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 2 ; <i32*> [#uses=1]
- store i32 3, i32* %60, align 8
+ %57 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 0 ; <%"struct.CDSVector<double,0,CDS::DefaultAlloc>"**> [#uses=1]
+ store %"struct.CDSVector<double,0,CDS::DefaultAlloc>"* %velv, %"struct.CDSVector<double,0,CDS::DefaultAlloc>"** %57, align 8
+ %58 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 1 ; <i32*> [#uses=1]
+ store i32 4, i32* %58, align 4
+ %59 = getelementptr %"struct.SubVector<CDSVector<double, 1, CDS::DefaultAlloc> >"* null, i32 0, i32 2 ; <i32*> [#uses=1]
+ store i32 3, i32* %59, align 8
unreachable
}
diff --git a/test/CodeGen/X86/no-elf-compact-unwind.ll b/test/CodeGen/X86/no-elf-compact-unwind.ll
deleted file mode 100644
index 8a15817..0000000
--- a/test/CodeGen/X86/no-elf-compact-unwind.ll
+++ /dev/null
@@ -1,48 +0,0 @@
-; RUN: llc < %s -mtriple x86_64-apple-macosx10.8.0 -disable-cfi | FileCheck -check-prefix=MACHO %s
-; RUN: llc < %s -mtriple x86_64-unknown-linux -disable-cfi | FileCheck -check-prefix=ELF %s
-
-; Make sure we don't generate a compact unwind for ELF.
-
-; MACHO-LABEL: _Z3barv:
-; MACHO: __compact_unwind
-
-; ELF-LABEL: _Z3barv:
-; ELF-NOT: __compact_unwind
-
-@_ZTIi = external constant i8*
-
-define void @_Z3barv() uwtable {
-entry:
- invoke void @_Z3foov()
- to label %try.cont unwind label %lpad
-
-lpad: ; preds = %entry
- %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
- catch i8* bitcast (i8** @_ZTIi to i8*)
- %1 = extractvalue { i8*, i32 } %0, 1
- %2 = tail call i32 @llvm.eh.typeid.for(i8* bitcast (i8** @_ZTIi to i8*))
- %matches = icmp eq i32 %1, %2
- br i1 %matches, label %catch, label %eh.resume
-
-catch: ; preds = %lpad
- %3 = extractvalue { i8*, i32 } %0, 0
- %4 = tail call i8* @__cxa_begin_catch(i8* %3)
- tail call void @__cxa_end_catch()
- br label %try.cont
-
-try.cont: ; preds = %entry, %catch
- ret void
-
-eh.resume: ; preds = %lpad
- resume { i8*, i32 } %0
-}
-
-declare void @_Z3foov()
-
-declare i32 @__gxx_personality_v0(...)
-
-declare i32 @llvm.eh.typeid.for(i8*)
-
-declare i8* @__cxa_begin_catch(i8*)
-
-declare void @__cxa_end_catch()
diff --git a/test/CodeGen/X86/nocx16.ll b/test/CodeGen/X86/nocx16.ll
index cceaac4..8b995da 100644
--- a/test/CodeGen/X86/nocx16.ll
+++ b/test/CodeGen/X86/nocx16.ll
@@ -2,7 +2,7 @@
define void @test(i128* %a) nounwind {
entry:
; CHECK: __sync_val_compare_and_swap_16
- %0 = cmpxchg i128* %a, i128 1, i128 1 seq_cst
+ %0 = cmpxchg i128* %a, i128 1, i128 1 seq_cst seq_cst
; CHECK: __sync_lock_test_and_set_16
%1 = atomicrmw xchg i128* %a, i128 1 seq_cst
; CHECK: __sync_fetch_and_add_16
diff --git a/test/CodeGen/X86/opaque-constant-asm.ll b/test/CodeGen/X86/opaque-constant-asm.ll
new file mode 100644
index 0000000..dd1cc8e
--- /dev/null
+++ b/test/CodeGen/X86/opaque-constant-asm.ll
@@ -0,0 +1,13 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -no-integrated-as | FileCheck %s
+; This test makes sure that we do not mistake the bitcast inside the asm statement
+; as an opaque constant. If we do, then the compilation will simply fail.
+
+%struct2 = type <{ i32, i32, i32, i32 }>
+%union.anon = type { [2 x i64], [4 x i32] }
+%struct1 = type { i32, %union.anon }
+
+define void @test() {
+; CHECK: #ASM $16
+ call void asm sideeffect "#ASM $0", "n"(i32 ptrtoint (i32* getelementptr inbounds (%struct2* bitcast (%union.anon* getelementptr inbounds (%struct1* null, i32 0, i32 1) to %struct2*), i32 0, i32 2) to i32))
+ ret void
+}
diff --git a/test/CodeGen/X86/osx-private-labels.ll b/test/CodeGen/X86/osx-private-labels.ll
new file mode 100644
index 0000000..349ce7d
--- /dev/null
+++ b/test/CodeGen/X86/osx-private-labels.ll
@@ -0,0 +1,71 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; Test all the cases where an L label is safe. Removing any entry from
+; TargetLoweringObjectFileMachO::isSectionAtomizableBySymbols should cause
+; this to fail.
+; We also test some noteworthy cases that require an l label.
+
+@private1 = private unnamed_addr constant [4 x i8] c"zed\00"
+; CHECK: .section __TEXT,__cstring,cstring_literals
+; CHECK-NEXT: L_private1:
+
+@private2 = private unnamed_addr constant [5 x i16] [i16 116, i16 101,
+ i16 115, i16 116, i16 0]
+; CHECK: .section __TEXT,__ustring
+; CHECK-NEXT: .align 1
+; CHECK-NEXT: l_private2:
+
+; There is no dedicated 4 byte string section on MachO.
+
+%struct.NSConstantString = type { i32*, i32, i8*, i32 }
+@private3 = private constant %struct.NSConstantString { i32* null, i32 1992, i8* null, i32 0 }, section "__DATA,__cfstring"
+; CHECK: .section __DATA,__cfstring
+; CHECK-NEXT: .align 4
+; CHECK-NEXT: L_private3:
+
+; There is no dedicated 1 or 2 byte constant section on MachO.
+
+@private4 = private unnamed_addr constant i32 42
+; CHECK: .section __TEXT,__literal4,4byte_literals
+; CHECK-NEXT: .align 2
+; CHECK-NEXT: L_private4:
+
+@private5 = private unnamed_addr constant i64 42
+; CHECK: .section __TEXT,__literal8,8byte_literals
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private5:
+
+@private6 = private unnamed_addr constant i128 42
+; CHECK: .section __TEXT,__literal16,16byte_literals
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private6:
+
+%struct._objc_class = type { i8* }
+@private7 = private global %struct._objc_class* null, section "__OBJC,__cls_refs,literal_pointers,no_dead_strip"
+; CHECK: .section __OBJC,__cls_refs,literal_pointers,no_dead_strip
+; CHECK: .align 3
+; CHECK: L_private7:
+
+@private8 = private global i32* null, section "__DATA,__nl_symbol_ptr,non_lazy_symbol_pointers"
+; CHECK: .section __DATA,__nl_symbol_ptr,non_lazy_symbol_pointers
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private8:
+
+@private9 = private global i32* null, section "__DATA,__la_symbol_ptr,lazy_symbol_pointers"
+; CHECK: .section __DATA,__la_symbol_ptr,lazy_symbol_pointers
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private9:
+
+@private10 = private global i32* null, section "__DATA,__mod_init_func,mod_init_funcs"
+; CHECK: .section __DATA,__mod_init_func,mod_init_funcs
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private10:
+
+@private11 = private global i32* null, section "__DATA,__mod_term_func,mod_term_funcs"
+; CHECK: .section __DATA,__mod_term_func,mod_term_funcs
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private11:
+
+@private12 = private global i32* null, section "__DATA,__foobar,interposing"
+; CHECK: .section __DATA,__foobar,interposing
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: L_private12:
diff --git a/test/CodeGen/X86/patchpoint.ll b/test/CodeGen/X86/patchpoint.ll
index d534639..62b1273 100644
--- a/test/CodeGen/X86/patchpoint.ll
+++ b/test/CodeGen/X86/patchpoint.ll
@@ -7,16 +7,16 @@ entry:
; CHECK-LABEL: trivial_patchpoint_codegen:
; CHECK: movabsq $-559038736, %r11
; CHECK-NEXT: callq *%r11
-; CHECK-NEXT: nop
+; CHECK-NEXT: xchgw %ax, %ax
; CHECK: movq %rax, %[[REG:r.+]]
; CHECK: callq *%r11
-; CHECK-NEXT: nop
+; CHECK-NEXT: xchgw %ax, %ax
; CHECK: movq %[[REG]], %rax
; CHECK: ret
%resolveCall2 = inttoptr i64 -559038736 to i8*
- %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 2, i32 15, i8* %resolveCall2, i32 4, i64 %p1, i64 %p2, i64 %p3, i64 %p4)
%resolveCall3 = inttoptr i64 -559038737 to i8*
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 3, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
ret i64 %result
}
@@ -34,31 +34,65 @@ entry:
store i64 11, i64* %metadata
store i64 12, i64* %metadata
store i64 13, i64* %metadata
- call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 4, i32 0, i64* %metadata)
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 0, i64* %metadata)
ret void
}
; Test the webkit_jscc calling convention.
-; Two arguments will be pushed on the stack.
+; One argument will be passed in a register, the other will be pushed on the stack.
; Return value in $rax.
define void @jscall_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: jscall_patchpoint_codegen:
; CHECK: Ltmp
-; CHECK: movq %r{{.+}}, 8(%rsp)
; CHECK: movq %r{{.+}}, (%rsp)
+; CHECK: movq %r{{.+}}, %rax
; CHECK: Ltmp
; CHECK-NEXT: movabsq $-559038736, %r11
; CHECK-NEXT: callq *%r11
-; CHECK: movq %rax, 8(%rsp)
+; CHECK: movq %rax, (%rsp)
; CHECK: callq
%resolveCall2 = inttoptr i64 -559038736 to i8*
- %result = tail call webkit_jscc i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveCall2, i32 2, i64 %p1, i64 %p2)
+ %result = tail call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveCall2, i32 2, i64 %p4, i64 %p2)
%resolveCall3 = inttoptr i64 -559038737 to i8*
- tail call webkit_jscc void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveCall3, i32 2, i64 %p1, i64 %result)
+ tail call webkit_jscc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveCall3, i32 2, i64 %p4, i64 %result)
ret void
}
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen2(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen2:
+; CHECK: Ltmp
+; CHECK: movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+ %call = inttoptr i64 -559038736 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 6, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6)
+ ret i64 %result
+}
+
+; Test that the arguments are properly aligned and that we don't store undef arguments.
+define i64 @jscall_patchpoint_codegen3(i64 %callee) {
+entry:
+; CHECK-LABEL: jscall_patchpoint_codegen3:
+; CHECK: Ltmp
+; CHECK: movq $10, 48(%rsp)
+; CHECK-NEXT: movl $8, 36(%rsp)
+; CHECK-NEXT: movq $6, 24(%rsp)
+; CHECK-NEXT: movl $4, 16(%rsp)
+; CHECK-NEXT: movq $2, (%rsp)
+; CHECK: Ltmp
+; CHECK-NEXT: movabsq $-559038736, %r11
+; CHECK-NEXT: callq *%r11
+ %call = inttoptr i64 -559038736 to i8*
+ %result = call webkit_jscc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 7, i32 15, i8* %call, i32 10, i64 %callee, i64 2, i64 undef, i32 4, i32 undef, i64 6, i32 undef, i32 8, i32 undef, i64 10)
+ ret i64 %result
+}
+
; Test patchpoints reusing the same TargetConstant.
; <rdar:15390785> Assertion failed: (CI.getNumArgOperands() >= NumArgs + 4)
; There is no way to verify this, since it depends on memory allocation.
@@ -68,14 +102,14 @@ entry:
%tmp80 = add i64 %tmp79, -16
%tmp81 = inttoptr i64 %tmp80 to i64*
%tmp82 = load i64* %tmp81, align 8
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i64 %arg, i64 %tmp2, i64 %tmp10, i64 %tmp82)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 15, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp82)
%tmp83 = load i64* %tmp33, align 8
%tmp84 = add i64 %tmp83, -24
%tmp85 = inttoptr i64 %tmp84 to i64*
%tmp86 = load i64* %tmp85, align 8
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 5, i64 %arg, i64 %tmp10, i64 %tmp86)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 18, i32 30, i8* null, i32 3, i64 %arg, i64 %tmp10, i64 %tmp86)
ret i64 10
}
@@ -84,17 +118,13 @@ define void @small_patchpoint_codegen(i64 %p1, i64 %p2, i64 %p3, i64 %p4) {
entry:
; CHECK-LABEL: small_patchpoint_codegen:
; CHECK: Ltmp
-; CHECK: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
-; CHECK-NEXT: nop
+; CHECK: nopl 8(%rax,%rax)
; CHECK-NEXT: popq
; CHECK-NEXT: ret
- %result = tail call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
+ %result = tail call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 5, i8* null, i32 2, i64 %p1, i64 %p2)
ret void
}
-declare void @llvm.experimental.stackmap(i32, i32, ...)
-declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/peephole-multiple-folds.ll b/test/CodeGen/X86/peephole-multiple-folds.ll
new file mode 100644
index 0000000..d184569
--- /dev/null
+++ b/test/CodeGen/X86/peephole-multiple-folds.ll
@@ -0,0 +1,29 @@
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s
+;
+; Test multiple peephole-time folds in a single basic block.
+; <rdar://problem/16478629>
+
+define <8 x float> @test_peephole_multi_fold(<8 x float>* %p1, <8 x float>* %p2) {
+entry:
+ br label %loopbody
+
+loopbody:
+; CHECK: test_peephole_multi_fold:
+; CHECK: vfmadd231ps (%rdi),
+; CHECK: vfmadd231ps (%rsi),
+ %vsum1 = phi <8 x float> [ %vsum1.next, %loopbody ], [ zeroinitializer, %entry ]
+ %vsum2 = phi <8 x float> [ %vsum2.next, %loopbody ], [ zeroinitializer, %entry ]
+ %m1 = load <8 x float>* %p1, align 1
+ %m2 = load <8 x float>* %p2, align 1
+ %vsum1.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m1, <8 x float> zeroinitializer, <8 x float> %vsum1)
+ %vsum2.next = tail call <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float> %m2, <8 x float> zeroinitializer, <8 x float> %vsum2)
+ %vsum1.next.1 = extractelement <8 x float> %vsum1.next, i32 0
+ %c = fcmp oeq float %vsum1.next.1, 0.0
+ br i1 %c, label %loopbody, label %loopexit
+
+loopexit:
+ %r = fadd <8 x float> %vsum1.next, %vsum2.next
+ ret <8 x float> %r
+}
+
+declare <8 x float> @llvm.x86.fma.vfmadd.ps.256(<8 x float>, <8 x float>, <8 x float>)
diff --git a/test/CodeGen/X86/personality.ll b/test/CodeGen/X86/personality.ll
index 51be7bc..424a307 100644
--- a/test/CodeGen/X86/personality.ll
+++ b/test/CodeGen/X86/personality.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -disable-cfi -mtriple=x86_64-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -disable-cfi -mtriple=i386-apple-darwin9 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i386-apple-darwin9 | FileCheck %s -check-prefix=X32
; PR1632
define void @_Z1fv() {
@@ -41,15 +41,10 @@ declare void @__cxa_end_catch()
declare i32 @__gxx_personality_v0(...)
-; X64: zPLR
-; X64: .byte 155
-; X64-NEXT: .long ___gxx_personality_v0@GOTPCREL+4
+; X64: .cfi_personality 155, ___gxx_personality_v0
+
+; X32: .cfi_personality 155, L___gxx_personality_v0$non_lazy_ptr
; X32: .section __IMPORT,__pointers,non_lazy_symbol_pointers
; X32-NEXT: L___gxx_personality_v0$non_lazy_ptr:
; X32-NEXT: .indirect_symbol ___gxx_personality_v0
-
-; X32: zPLR
-; X32: .byte 155
-; X32-NEXT: :
-; X32-NEXT: .long L___gxx_personality_v0$non_lazy_ptr-
diff --git a/test/CodeGen/X86/personality_size.ll b/test/CodeGen/X86/personality_size.ll
index 30a5d39..79d131b 100644
--- a/test/CodeGen/X86/personality_size.ll
+++ b/test/CodeGen/X86/personality_size.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -relocation-model=pic -disable-cfi -mtriple=x86_64-pc-solaris2.11 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -relocation-model=pic -disable-cfi -mtriple=i386-pc-solaris2.11 -disable-cgp-branch-opts | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -relocation-model=pic -mtriple=x86_64-pc-solaris2.11 | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -relocation-model=pic -mtriple=i386-pc-solaris2.11 | FileCheck %s -check-prefix=X32
; PR1632
define void @_Z1fv() {
diff --git a/test/CodeGen/X86/pic.ll b/test/CodeGen/X86/pic.ll
index 7bb127e..da1e224 100644
--- a/test/CodeGen/X86/pic.ll
+++ b/test/CodeGen/X86/pic.ll
@@ -192,7 +192,8 @@ bb12:
; LINUX: .LJTI7_0@GOTOFF(
; LINUX: jmpl *
-; LINUX: .LJTI7_0:
+; LINUX: .align 4
+; LINUX-NEXT: .LJTI7_0:
; LINUX: .long .LBB7_2@GOTOFF
; LINUX: .long .LBB7_8@GOTOFF
; LINUX: .long .LBB7_14@GOTOFF
diff --git a/test/CodeGen/X86/pr10420.ll b/test/CodeGen/X86/pr10420.ll
deleted file mode 100644
index 3993f24..0000000
--- a/test/CodeGen/X86/pr10420.ll
+++ /dev/null
@@ -1,21 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-apple-macosx -disable-cfi | FileCheck %s
-
-define private void @foo() {
- ret void
-}
-
-define void @bar() {
- call void @foo()
- ret void;
-}
-
-; CHECK: _bar: ## @bar
-; CHECK-NEXT: Ltmp2:
-
-; CHECK: Ltmp12:
-; CHECK-NEXT: Ltmp13 = L_foo-Ltmp12 ## FDE initial location
-; CHECK-NEXT: .quad Ltmp13
-
-; CHECK: Ltmp19:
-; CHECK-NEXT: Ltmp20 = Ltmp2-Ltmp19 ## FDE initial location
-; CHECK-NEXT: .quad Ltmp20
diff --git a/test/CodeGen/X86/pr14090.ll b/test/CodeGen/X86/pr14090.ll
deleted file mode 100644
index 2f7c720..0000000
--- a/test/CodeGen/X86/pr14090.ll
+++ /dev/null
@@ -1,70 +0,0 @@
-; RUN: llc < %s -march=x86-64 -print-before=stack-coloring -print-after=stack-coloring >%t 2>&1 && FileCheck <%t %s
-
-define void @foo(i64* %retval.i, i32 %call, i32* %.ph.i80, i32 %fourteen, i32* %out.lo, i32* %out.hi) nounwind align 2 {
-entry:
- %_Tmp.i39 = alloca i64, align 8
- %retval.i33 = alloca i64, align 8
- %_Tmp.i = alloca i64, align 8
- %retval.i.i = alloca i64, align 8
- %_First.i = alloca i64, align 8
-
- %0 = load i64* %retval.i, align 8
-
- %1 = load i64* %retval.i, align 8
-
- %_Tmp.i39.0.cast73 = bitcast i64* %_Tmp.i39 to i8*
- call void @llvm.lifetime.start(i64 8, i8* %_Tmp.i39.0.cast73)
- store i64 %1, i64* %_Tmp.i39, align 8
- %cmp.i.i.i40 = icmp slt i32 %call, 0
- %2 = lshr i64 %1, 32
- %3 = trunc i64 %2 to i32
- %sub.i.i.i44 = sub i32 0, %call
- %cmp2.i.i.i45 = icmp ult i32 %3, %sub.i.i.i44
- %or.cond.i.i.i46 = and i1 %cmp.i.i.i40, %cmp2.i.i.i45
- %add.i.i.i47 = add i32 %3, %call
- %sub5.i.i.i48 = lshr i32 %add.i.i.i47, 5
- %trunc.i50 = trunc i64 %1 to i32
- %inttoptr.i51 = inttoptr i32 %trunc.i50 to i32*
- %add61617.i.i.i52 = or i32 %sub5.i.i.i48, -134217728
- %add61617.i.sub5.i.i.i53 = select i1 %or.cond.i.i.i46, i32 %add61617.i.i.i52, i32 %sub5.i.i.i48
- %storemerge2.i.i54 = getelementptr inbounds i32* %inttoptr.i51, i32 %add61617.i.sub5.i.i.i53
- %_Tmp.i39.0.cast74 = bitcast i64* %_Tmp.i39 to i32**
- store i32* %storemerge2.i.i54, i32** %_Tmp.i39.0.cast74, align 8
- %storemerge.i.i55 = and i32 %add.i.i.i47, 31
- %_Tmp.i39.4.raw_idx = getelementptr inbounds i8* %_Tmp.i39.0.cast73, i32 4
- %_Tmp.i39.4.cast = bitcast i8* %_Tmp.i39.4.raw_idx to i32*
- store i32 %storemerge.i.i55, i32* %_Tmp.i39.4.cast, align 4
- %srcval.i56 = load i64* %_Tmp.i39, align 8
- call void @llvm.lifetime.end(i64 8, i8* %_Tmp.i39.0.cast73)
-
-; CHECK: Before Merge disjoint stack slots
-; CHECK: [[PREFIX15:MOV64mr.*<fi#]]{{[0-9]}}[[SUFFIX15:.*;]] mem:ST8[%fifteen]
-; CHECK: [[PREFIX87:MOV32mr.*;]] mem:ST4[%sunkaddr87]
-
-; CHECK: After Merge disjoint stack slots
-; CHECK: [[PREFIX15]]{{[0-9]}}[[SUFFIX15]] mem:ST8[%_Tmp.i39]
-; CHECK: [[PREFIX87]] mem:ST4[<unknown>]
-
- %fifteen = bitcast i64* %retval.i.i to i32**
- %sixteen = bitcast i64* %retval.i.i to i8*
- call void @llvm.lifetime.start(i64 8, i8* %sixteen)
- store i32* %.ph.i80, i32** %fifteen, align 8
- %sunkaddr = ptrtoint i64* %retval.i.i to i32
- %sunkaddr86 = add i32 %sunkaddr, 4
- %sunkaddr87 = inttoptr i32 %sunkaddr86 to i32*
- store i32 %fourteen, i32* %sunkaddr87, align 4
- %seventeen = load i64* %retval.i.i, align 8
- call void @llvm.lifetime.end(i64 8, i8* %sixteen)
- %eighteen = lshr i64 %seventeen, 32
- %nineteen = trunc i64 %eighteen to i32
- %shl.i.i.i = shl i32 1, %nineteen
-
- store i32 %shl.i.i.i, i32* %out.lo, align 8
- store i32 %nineteen, i32* %out.hi, align 8
-
- ret void
-}
-
-declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
-
-declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
diff --git a/test/CodeGen/X86/pr1462.ll b/test/CodeGen/X86/pr1462.ll
index 62549a5..3aa1860 100644
--- a/test/CodeGen/X86/pr1462.ll
+++ b/test/CodeGen/X86/pr1462.ll
@@ -1,8 +1,7 @@
; RUN: llc < %s
; PR1462
-target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-
-v64:64:64-v128:128:128-a0:0:64"
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64"
target triple = "x86_64-unknown-linux-gnu"
define hidden i128 @__addvti3(i128 %a1, i128 %b2) {
diff --git a/test/CodeGen/X86/pr16031.ll b/test/CodeGen/X86/pr16031.ll
index ecf6218..dc16fd9 100644
--- a/test/CodeGen/X86/pr16031.ll
+++ b/test/CodeGen/X86/pr16031.ll
@@ -2,9 +2,9 @@
; CHECK-LABEL: main:
; CHECK: pushl %esi
+; CHECK-NEXT: testb $1, 8(%esp)
; CHECK-NEXT: movl $-12, %eax
; CHECK-NEXT: movl $-1, %edx
-; CHECK-NEXT: testb $1, 8(%esp)
; CHECK-NEXT: cmovel %edx, %eax
; CHECK-NEXT: xorl %ecx, %ecx
; CHECK-NEXT: movl %eax, %esi
diff --git a/test/CodeGen/X86/pr19049.ll b/test/CodeGen/X86/pr19049.ll
new file mode 100644
index 0000000..027c981
--- /dev/null
+++ b/test/CodeGen/X86/pr19049.ll
@@ -0,0 +1,7 @@
+; RUN: llc -mtriple x86_64-pc-linux %s -o - | FileCheck %s
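+;
+; Module-level inline asm may switch sections; check that the compiler
+; switches back to .text afterwards so later code lands in the right section.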
+
+module asm ".pushsection foo"
+module asm ".popsection"
+
+; CHECK: .section foo,"",@progbits
+; CHECK: .text
diff --git a/test/CodeGen/X86/preserve_allcc64.ll b/test/CodeGen/X86/preserve_allcc64.ll
new file mode 100644
index 0000000..545cd36
--- /dev/null
+++ b/test/CodeGen/X86/preserve_allcc64.ll
@@ -0,0 +1,104 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck --check-prefix=SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
+
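+; Every GPR except R11 and all XMM registers should be saved.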
+define preserve_allcc void @preserve_allcc1() nounwind {
+entry:
+;SSE-LABEL: preserve_allcc1
+;SSE: pushq %r10
+;SSE-NEXT: pushq %r9
+;SSE-NEXT: pushq %r8
+;SSE-NEXT: pushq %rdi
+;SSE-NEXT: pushq %rsi
+;SSE-NEXT: pushq %rdx
+;SSE-NEXT: pushq %rcx
+;SSE-NEXT: pushq %rax
+;SSE-NEXT: pushq %rbp
+;SSE-NEXT: pushq %r15
+;SSE-NEXT: pushq %r14
+;SSE-NEXT: pushq %r13
+;SSE-NEXT: pushq %r12
+;SSE-NEXT: pushq %rbx
+;SSE: movaps %xmm15
+;SSE-NEXT: movaps %xmm14
+;SSE-NEXT: movaps %xmm13
+;SSE-NEXT: movaps %xmm12
+;SSE-NEXT: movaps %xmm11
+;SSE-NEXT: movaps %xmm10
+;SSE-NEXT: movaps %xmm9
+;SSE-NEXT: movaps %xmm8
+;SSE-NEXT: movaps %xmm7
+;SSE-NEXT: movaps %xmm6
+;SSE-NEXT: movaps %xmm5
+;SSE-NEXT: movaps %xmm4
+;SSE-NEXT: movaps %xmm3
+;SSE-NEXT: movaps %xmm2
+;SSE-NEXT: movaps %xmm1
+;SSE-NEXT: movaps %xmm0
+;AVX-LABEL: preserve_allcc1
+;AVX: pushq %r10
+;AVX-NEXT: pushq %r9
+;AVX-NEXT: pushq %r8
+;AVX-NEXT: pushq %rdi
+;AVX-NEXT: pushq %rsi
+;AVX-NEXT: pushq %rdx
+;AVX-NEXT: pushq %rcx
+;AVX-NEXT: pushq %rax
+;AVX-NEXT: pushq %rbp
+;AVX-NEXT: pushq %r15
+;AVX-NEXT: pushq %r14
+;AVX-NEXT: pushq %r13
+;AVX-NEXT: pushq %r12
+;AVX-NEXT: pushq %rbx
+;AVX: vmovups %ymm15
+;AVX-NEXT: vmovups %ymm14
+;AVX-NEXT: vmovups %ymm13
+;AVX-NEXT: vmovups %ymm12
+;AVX-NEXT: vmovups %ymm11
+;AVX-NEXT: vmovups %ymm10
+;AVX-NEXT: vmovups %ymm9
+;AVX-NEXT: vmovups %ymm8
+;AVX-NEXT: vmovups %ymm7
+;AVX-NEXT: vmovups %ymm6
+;AVX-NEXT: vmovups %ymm5
+;AVX-NEXT: vmovups %ymm4
+;AVX-NEXT: vmovups %ymm3
+;AVX-NEXT: vmovups %ymm2
+;AVX-NEXT: vmovups %ymm1
+;AVX-NEXT: vmovups %ymm0
+ call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"()
+ ret void
+}
+
+; Make sure only R11 is saved before the call
+declare preserve_allcc void @bar(i64, i64, double, double)
+define void @preserve_allcc2() nounwind {
+entry:
+;SSE-LABEL: preserve_allcc2
+;SSE: movq %r11, [[REG:%[a-z0-9]+]]
+;SSE-NOT: movaps %xmm
+;SSE: movq [[REG]], %r11
+ %a0 = call i64 asm sideeffect "", "={rax}"() nounwind
+ %a1 = call i64 asm sideeffect "", "={rcx}"() nounwind
+ %a2 = call i64 asm sideeffect "", "={rdx}"() nounwind
+ %a3 = call i64 asm sideeffect "", "={r8}"() nounwind
+ %a4 = call i64 asm sideeffect "", "={r9}"() nounwind
+ %a5 = call i64 asm sideeffect "", "={r10}"() nounwind
+ %a6 = call i64 asm sideeffect "", "={r11}"() nounwind
+ %a10 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+ %a11 = call <2 x double> asm sideeffect "", "={xmm3}"() nounwind
+ %a12 = call <2 x double> asm sideeffect "", "={xmm4}"() nounwind
+ %a13 = call <2 x double> asm sideeffect "", "={xmm5}"() nounwind
+ %a14 = call <2 x double> asm sideeffect "", "={xmm6}"() nounwind
+ %a15 = call <2 x double> asm sideeffect "", "={xmm7}"() nounwind
+ %a16 = call <2 x double> asm sideeffect "", "={xmm8}"() nounwind
+ %a17 = call <2 x double> asm sideeffect "", "={xmm9}"() nounwind
+ %a18 = call <2 x double> asm sideeffect "", "={xmm10}"() nounwind
+ %a19 = call <2 x double> asm sideeffect "", "={xmm11}"() nounwind
+ %a20 = call <2 x double> asm sideeffect "", "={xmm12}"() nounwind
+ %a21 = call <2 x double> asm sideeffect "", "={xmm13}"() nounwind
+ %a22 = call <2 x double> asm sideeffect "", "={xmm14}"() nounwind
+ %a23 = call <2 x double> asm sideeffect "", "={xmm15}"() nounwind
+ call preserve_allcc void @bar(i64 1, i64 2, double 3.0, double 4.0)
+ call void asm sideeffect "", "{rax},{rcx},{rdx},{r8},{r9},{r10},{r11},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},{xmm8},{xmm9},{xmm10},{xmm11},{xmm12},{xmm13},{xmm14},{xmm15}"(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, <2 x double> %a10, <2 x double> %a11, <2 x double> %a12, <2 x double> %a13, <2 x double> %a14, <2 x double> %a15, <2 x double> %a16, <2 x double> %a17, <2 x double> %a18, <2 x double> %a19, <2 x double> %a20, <2 x double> %a21, <2 x double> %a22, <2 x double> %a23)
+ ret void
+}
diff --git a/test/CodeGen/X86/preserve_mostcc64.ll b/test/CodeGen/X86/preserve_mostcc64.ll
new file mode 100644
index 0000000..4ee293e
--- /dev/null
+++ b/test/CodeGen/X86/preserve_mostcc64.ll
@@ -0,0 +1,86 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 | FileCheck --check-prefix=SSE %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck --check-prefix=AVX %s
+
+; Every GPR should be saved, except R11.
+define preserve_mostcc void @preserve_mostcc1() nounwind {
+entry:
+;SSE-LABEL: preserve_mostcc1
+;SSE: pushq %r10
+;SSE-NEXT: pushq %r9
+;SSE-NEXT: pushq %r8
+;SSE-NEXT: pushq %rdi
+;SSE-NEXT: pushq %rsi
+;SSE-NEXT: pushq %rdx
+;SSE-NEXT: pushq %rcx
+;SSE-NEXT: pushq %rax
+;SSE-NEXT: pushq %rbp
+;SSE-NEXT: pushq %r15
+;SSE-NEXT: pushq %r14
+;SSE-NEXT: pushq %r13
+;SSE-NEXT: pushq %r12
+;SSE-NEXT: pushq %rbx
+;AVX-LABEL: preserve_mostcc1
+;AVX: pushq %r10
+;AVX-NEXT: pushq %r9
+;AVX-NEXT: pushq %r8
+;AVX-NEXT: pushq %rdi
+;AVX-NEXT: pushq %rsi
+;AVX-NEXT: pushq %rdx
+;AVX-NEXT: pushq %rcx
+;AVX-NEXT: pushq %rax
+;AVX-NEXT: pushq %rbp
+;AVX-NEXT: pushq %r15
+;AVX-NEXT: pushq %r14
+;AVX-NEXT: pushq %r13
+;AVX-NEXT: pushq %r12
+;AVX-NEXT: pushq %rbx
+ call void asm sideeffect "", "~{rax},~{rbx},~{rcx},~{rdx},~{rsi},~{rdi},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15},~{rbp},~{xmm0},~{xmm1},~{xmm2},~{xmm3},~{xmm4},~{xmm5},~{xmm6},~{xmm7},~{xmm8},~{xmm9},~{xmm10},~{xmm11},~{xmm12},~{xmm13},~{xmm14},~{xmm15}"()
+ ret void
+}
+
+; Make sure R11 and XMMs are saved before the call
+declare preserve_mostcc void @foo(i64, i64, double, double)
+define void @preserve_mostcc2() nounwind {
+entry:
+;SSE-LABEL: preserve_mostcc2
+;SSE: movq %r11, [[REG:%[a-z0-9]+]]
+;SSE: movaps %xmm2
+;SSE: movaps %xmm3
+;SSE: movaps %xmm4
+;SSE: movaps %xmm5
+;SSE: movaps %xmm6
+;SSE: movaps %xmm7
+;SSE: movaps %xmm8
+;SSE: movaps %xmm9
+;SSE: movaps %xmm10
+;SSE: movaps %xmm11
+;SSE: movaps %xmm12
+;SSE: movaps %xmm13
+;SSE: movaps %xmm14
+;SSE: movaps %xmm15
+;SSE: movq [[REG]], %r11
+ %a0 = call i64 asm sideeffect "", "={rax}"() nounwind
+ %a1 = call i64 asm sideeffect "", "={rcx}"() nounwind
+ %a2 = call i64 asm sideeffect "", "={rdx}"() nounwind
+ %a3 = call i64 asm sideeffect "", "={r8}"() nounwind
+ %a4 = call i64 asm sideeffect "", "={r9}"() nounwind
+ %a5 = call i64 asm sideeffect "", "={r10}"() nounwind
+ %a6 = call i64 asm sideeffect "", "={r11}"() nounwind
+ %a10 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+ %a11 = call <2 x double> asm sideeffect "", "={xmm3}"() nounwind
+ %a12 = call <2 x double> asm sideeffect "", "={xmm4}"() nounwind
+ %a13 = call <2 x double> asm sideeffect "", "={xmm5}"() nounwind
+ %a14 = call <2 x double> asm sideeffect "", "={xmm6}"() nounwind
+ %a15 = call <2 x double> asm sideeffect "", "={xmm7}"() nounwind
+ %a16 = call <2 x double> asm sideeffect "", "={xmm8}"() nounwind
+ %a17 = call <2 x double> asm sideeffect "", "={xmm9}"() nounwind
+ %a18 = call <2 x double> asm sideeffect "", "={xmm10}"() nounwind
+ %a19 = call <2 x double> asm sideeffect "", "={xmm11}"() nounwind
+ %a20 = call <2 x double> asm sideeffect "", "={xmm12}"() nounwind
+ %a21 = call <2 x double> asm sideeffect "", "={xmm13}"() nounwind
+ %a22 = call <2 x double> asm sideeffect "", "={xmm14}"() nounwind
+ %a23 = call <2 x double> asm sideeffect "", "={xmm15}"() nounwind
+ call preserve_mostcc void @foo(i64 1, i64 2, double 3.0, double 4.0)
+ call void asm sideeffect "", "{rax},{rcx},{rdx},{r8},{r9},{r10},{r11},{xmm2},{xmm3},{xmm4},{xmm5},{xmm6},{xmm7},{xmm8},{xmm9},{xmm10},{xmm11},{xmm12},{xmm13},{xmm14},{xmm15}"(i64 %a0, i64 %a1, i64 %a2, i64 %a3, i64 %a4, i64 %a5, i64 %a6, <2 x double> %a10, <2 x double> %a11, <2 x double> %a12, <2 x double> %a13, <2 x double> %a14, <2 x double> %a15, <2 x double> %a16, <2 x double> %a17, <2 x double> %a18, <2 x double> %a19, <2 x double> %a20, <2 x double> %a21, <2 x double> %a22, <2 x double> %a23)
+ ret void
+}
diff --git a/test/CodeGen/X86/private-2.ll b/test/CodeGen/X86/private-2.ll
index 4413cee..cf2d741 100644
--- a/test/CodeGen/X86/private-2.ll
+++ b/test/CodeGen/X86/private-2.ll
@@ -2,7 +2,7 @@
; Quote should be outside of private prefix.
; rdar://6855766x
-; CHECK: L__ZZ20
+; CHECK: "l__ZZ20-[Example1 whatever]E4C.91"
%struct.A = type { i32*, i32 }
@"_ZZ20-[Example1 whatever]E4C.91" = private constant %struct.A { i32* null, i32 1 } ; <%struct.A*> [#uses=1]
diff --git a/test/CodeGen/X86/ragreedy-bug.ll b/test/CodeGen/X86/ragreedy-bug.ll
new file mode 100644
index 0000000..df9b41d
--- /dev/null
+++ b/test/CodeGen/X86/ragreedy-bug.ll
@@ -0,0 +1,292 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+
+; This test case is reduced from the prune_match function in 197.parser.
+; We make sure register copies are not generated in the isupper.exit blocks.
+
+; CHECK: isupper.exit
+; CHECK-NEXT: in Loop
+; CHECK-NEXT: testl
+; CHECK-NEXT: jne
+; CHECK: isupper.exit
+; CHECK-NEXT: in Loop
+; CHECK-NEXT: testl
+; CHECK-NEXT: je
+; CHECK: maskrune
+; CHECK: maskrune
+
+%struct.List_o_links_struct = type { i32, i32, i32, %struct.List_o_links_struct* }
+%struct.Connector_struct = type { i16, i16, i8, i8, %struct.Connector_struct*, i8* }
+%struct._RuneLocale = type { [8 x i8], [32 x i8], i32 (i8*, i64, i8**)*, i32 (i32, i8*, i64, i8**)*, i32, [256 x i32], [256 x i32], [256 x i32], %struct._RuneRange, %struct._RuneRange, %struct._RuneRange, i8*, i32, i32, %struct._RuneCharClass* }
+%struct._RuneRange = type { i32, %struct._RuneEntry* }
+%struct._RuneEntry = type { i32, i32, i32, i32* }
+%struct._RuneCharClass = type { [14 x i8], i32 }
+%struct.Exp_struct = type { i8, i8, i8, i8, %union.anon }
+%union.anon = type { %struct.E_list_struct* }
+%struct.E_list_struct = type { %struct.E_list_struct*, %struct.Exp_struct* }
+%struct.domain_struct = type { i8*, i32, %struct.List_o_links_struct*, i32, i32, %struct.d_tree_leaf_struct*, %struct.domain_struct* }
+%struct.d_tree_leaf_struct = type { %struct.domain_struct*, i32, %struct.d_tree_leaf_struct* }
+@_DefaultRuneLocale = external global %struct._RuneLocale
+declare i32 @__maskrune(i32, i64) #7
+define fastcc i32 @prune_match(%struct.Connector_struct* nocapture readonly %a, %struct.Connector_struct* nocapture readonly %b) #9 {
+entry:
+ %label56 = bitcast %struct.Connector_struct* %a to i16*
+ %0 = load i16* %label56, align 2
+ %label157 = bitcast %struct.Connector_struct* %b to i16*
+ %1 = load i16* %label157, align 2
+ %cmp = icmp eq i16 %0, %1
+ br i1 %cmp, label %if.end, label %return, !prof !988
+if.end:
+ %priority = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 2
+ %2 = load i8* %priority, align 1
+ %priority5 = getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 2
+ %3 = load i8* %priority5, align 1
+ %string = getelementptr inbounds %struct.Connector_struct* %a, i64 0, i32 5
+ %4 = load i8** %string, align 8
+ %string7 = getelementptr inbounds %struct.Connector_struct* %b, i64 0, i32 5
+ %5 = load i8** %string7, align 8
+ br label %while.cond
+while.cond:
+ %lsr.iv27 = phi i64 [ %lsr.iv.next28, %if.end17 ], [ 0, %if.end ]
+ %scevgep55 = getelementptr i8* %4, i64 %lsr.iv27
+ %6 = load i8* %scevgep55, align 1
+ %idxprom.i.i = sext i8 %6 to i64
+ %isascii.i.i224 = icmp sgt i8 %6, -1
+ br i1 %isascii.i.i224, label %cond.true.i.i, label %cond.false.i.i, !prof !181
+cond.true.i.i:
+ %arrayidx.i.i = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i
+ %7 = load i32* %arrayidx.i.i, align 4
+ %and.i.i = and i32 %7, 32768
+ br label %isupper.exit
+cond.false.i.i:
+ %8 = trunc i64 %idxprom.i.i to i8
+ %conv8 = sext i8 %8 to i32
+ %call3.i.i = tail call i32 @__maskrune(i32 %conv8, i64 32768) #3
+ br label %isupper.exit
+isupper.exit:
+ %tobool1.sink.i.in.i = phi i32 [ %and.i.i, %cond.true.i.i ], [ %call3.i.i, %cond.false.i.i ]
+ %tobool1.sink.i.i = icmp eq i32 %tobool1.sink.i.in.i, 0
+ br i1 %tobool1.sink.i.i, label %lor.rhs, label %while.body, !prof !989
+lor.rhs:
+ %sunkaddr = ptrtoint i8* %5 to i64
+ %sunkaddr58 = add i64 %sunkaddr, %lsr.iv27
+ %sunkaddr59 = inttoptr i64 %sunkaddr58 to i8*
+ %9 = load i8* %sunkaddr59, align 1
+ %idxprom.i.i214 = sext i8 %9 to i64
+ %isascii.i.i213225 = icmp sgt i8 %9, -1
+ br i1 %isascii.i.i213225, label %cond.true.i.i217, label %cond.false.i.i219, !prof !181
+cond.true.i.i217:
+ %arrayidx.i.i215 = getelementptr inbounds %struct._RuneLocale* @_DefaultRuneLocale, i64 0, i32 5, i64 %idxprom.i.i214
+ %10 = load i32* %arrayidx.i.i215, align 4
+ %and.i.i216 = and i32 %10, 32768
+ br label %isupper.exit223
+cond.false.i.i219:
+ %11 = trunc i64 %idxprom.i.i214 to i8
+ %conv9 = sext i8 %11 to i32
+ %call3.i.i218 = tail call i32 @__maskrune(i32 %conv9, i64 32768) #3
+ br label %isupper.exit223
+isupper.exit223:
+ %tobool1.sink.i.in.i220 = phi i32 [ %and.i.i216, %cond.true.i.i217 ], [ %call3.i.i218, %cond.false.i.i219 ]
+ %tobool1.sink.i.i221 = icmp eq i32 %tobool1.sink.i.in.i220, 0
+ br i1 %tobool1.sink.i.i221, label %while.end, label %while.body, !prof !990
+while.body:
+ %sunkaddr60 = ptrtoint i8* %4 to i64
+ %sunkaddr61 = add i64 %sunkaddr60, %lsr.iv27
+ %sunkaddr62 = inttoptr i64 %sunkaddr61 to i8*
+ %12 = load i8* %sunkaddr62, align 1
+ %sunkaddr63 = ptrtoint i8* %5 to i64
+ %sunkaddr64 = add i64 %sunkaddr63, %lsr.iv27
+ %sunkaddr65 = inttoptr i64 %sunkaddr64 to i8*
+ %13 = load i8* %sunkaddr65, align 1
+ %cmp14 = icmp eq i8 %12, %13
+ br i1 %cmp14, label %if.end17, label %return, !prof !991
+if.end17:
+ %lsr.iv.next28 = add i64 %lsr.iv27, 1
+ br label %while.cond
+while.end:
+ %14 = or i8 %3, %2
+ %15 = icmp eq i8 %14, 0
+ br i1 %15, label %if.then23, label %if.else88, !prof !992
+if.then23:
+ %sunkaddr66 = ptrtoint %struct.Connector_struct* %a to i64
+ %sunkaddr67 = add i64 %sunkaddr66, 16
+ %sunkaddr68 = inttoptr i64 %sunkaddr67 to i8**
+ %16 = load i8** %sunkaddr68, align 8
+ %17 = load i8* %16, align 1
+ %cmp26 = icmp eq i8 %17, 83
+ %sunkaddr69 = ptrtoint i8* %4 to i64
+ %sunkaddr70 = add i64 %sunkaddr69, %lsr.iv27
+ %sunkaddr71 = inttoptr i64 %sunkaddr70 to i8*
+ %18 = load i8* %sunkaddr71, align 1
+ br i1 %cmp26, label %land.lhs.true28, label %while.cond59.preheader, !prof !993
+land.lhs.true28:
+ switch i8 %18, label %land.rhs.preheader [
+ i8 112, label %land.lhs.true35
+ i8 0, label %return
+ ], !prof !994
+land.lhs.true35:
+ %sunkaddr72 = ptrtoint i8* %5 to i64
+ %sunkaddr73 = add i64 %sunkaddr72, %lsr.iv27
+ %sunkaddr74 = inttoptr i64 %sunkaddr73 to i8*
+ %19 = load i8* %sunkaddr74, align 1
+ switch i8 %19, label %land.rhs.preheader [
+ i8 112, label %land.lhs.true43
+ ], !prof !995
+land.lhs.true43:
+ %20 = ptrtoint i8* %16 to i64
+ %21 = sub i64 0, %20
+ %scevgep52 = getelementptr i8* %4, i64 %21
+ %scevgep53 = getelementptr i8* %scevgep52, i64 %lsr.iv27
+ %scevgep54 = getelementptr i8* %scevgep53, i64 -1
+ %cmp45 = icmp eq i8* %scevgep54, null
+ br i1 %cmp45, label %return, label %lor.lhs.false47, !prof !996
+lor.lhs.false47:
+ %22 = ptrtoint i8* %16 to i64
+ %23 = sub i64 0, %22
+ %scevgep47 = getelementptr i8* %4, i64 %23
+ %scevgep48 = getelementptr i8* %scevgep47, i64 %lsr.iv27
+ %scevgep49 = getelementptr i8* %scevgep48, i64 -2
+ %cmp50 = icmp eq i8* %scevgep49, null
+ br i1 %cmp50, label %land.lhs.true52, label %while.cond59.preheader, !prof !997
+land.lhs.true52:
+ %sunkaddr75 = ptrtoint i8* %4 to i64
+ %sunkaddr76 = add i64 %sunkaddr75, %lsr.iv27
+ %sunkaddr77 = add i64 %sunkaddr76, -1
+ %sunkaddr78 = inttoptr i64 %sunkaddr77 to i8*
+ %24 = load i8* %sunkaddr78, align 1
+ %cmp55 = icmp eq i8 %24, 73
+ %cmp61233 = icmp eq i8 %18, 0
+ %or.cond265 = or i1 %cmp55, %cmp61233
+ br i1 %or.cond265, label %return, label %land.rhs.preheader, !prof !998
+while.cond59.preheader:
+ %cmp61233.old = icmp eq i8 %18, 0
+ br i1 %cmp61233.old, label %return, label %land.rhs.preheader, !prof !999
+land.rhs.preheader:
+ %scevgep33 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep43 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs
+land.rhs:
+ %lsr.iv = phi i64 [ 0, %land.rhs.preheader ], [ %lsr.iv.next, %if.then83 ]
+ %25 = phi i8 [ %27, %if.then83 ], [ %18, %land.rhs.preheader ]
+ %scevgep34 = getelementptr i8* %scevgep33, i64 %lsr.iv
+ %26 = load i8* %scevgep34, align 1
+ %cmp64 = icmp eq i8 %26, 0
+ br i1 %cmp64, label %return, label %while.body66, !prof !1000
+while.body66:
+ %cmp68 = icmp eq i8 %25, 42
+ %cmp72 = icmp eq i8 %26, 42
+ %or.cond = or i1 %cmp68, %cmp72
+ br i1 %or.cond, label %if.then83, label %lor.lhs.false74, !prof !1001
+lor.lhs.false74:
+ %cmp77 = icmp ne i8 %25, %26
+ %cmp81 = icmp eq i8 %25, 94
+ %or.cond208 = or i1 %cmp77, %cmp81
+ br i1 %or.cond208, label %return, label %if.then83, !prof !1002
+if.then83:
+ %scevgep44 = getelementptr i8* %scevgep43, i64 %lsr.iv
+ %scevgep45 = getelementptr i8* %scevgep44, i64 1
+ %27 = load i8* %scevgep45, align 1
+ %cmp61 = icmp eq i8 %27, 0
+ %lsr.iv.next = add i64 %lsr.iv, 1
+ br i1 %cmp61, label %return, label %land.rhs, !prof !999
+if.else88:
+ %cmp89 = icmp eq i8 %2, 1
+ %cmp92 = icmp eq i8 %3, 2
+ %or.cond159 = and i1 %cmp89, %cmp92
+ br i1 %or.cond159, label %while.cond95.preheader, label %if.else123, !prof !1003
+while.cond95.preheader:
+ %sunkaddr79 = ptrtoint i8* %4 to i64
+ %sunkaddr80 = add i64 %sunkaddr79, %lsr.iv27
+ %sunkaddr81 = inttoptr i64 %sunkaddr80 to i8*
+ %28 = load i8* %sunkaddr81, align 1
+ %cmp97238 = icmp eq i8 %28, 0
+ br i1 %cmp97238, label %return, label %land.rhs99.preheader, !prof !1004
+land.rhs99.preheader:
+ %scevgep31 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep40 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs99
+land.rhs99:
+ %lsr.iv17 = phi i64 [ 0, %land.rhs99.preheader ], [ %lsr.iv.next18, %if.then117 ]
+ %29 = phi i8 [ %31, %if.then117 ], [ %28, %land.rhs99.preheader ]
+ %scevgep32 = getelementptr i8* %scevgep31, i64 %lsr.iv17
+ %30 = load i8* %scevgep32, align 1
+ %cmp101 = icmp eq i8 %30, 0
+ br i1 %cmp101, label %return, label %while.body104, !prof !1005
+while.body104:
+ %cmp107 = icmp eq i8 %29, %30
+ %cmp111 = icmp eq i8 %29, 42
+ %or.cond209 = or i1 %cmp107, %cmp111
+ %cmp115 = icmp eq i8 %30, 94
+ %or.cond210 = or i1 %or.cond209, %cmp115
+ br i1 %or.cond210, label %if.then117, label %return, !prof !1006
+if.then117:
+ %scevgep41 = getelementptr i8* %scevgep40, i64 %lsr.iv17
+ %scevgep42 = getelementptr i8* %scevgep41, i64 1
+ %31 = load i8* %scevgep42, align 1
+ %cmp97 = icmp eq i8 %31, 0
+ %lsr.iv.next18 = add i64 %lsr.iv17, 1
+ br i1 %cmp97, label %return, label %land.rhs99, !prof !1004
+if.else123:
+ %cmp124 = icmp eq i8 %3, 1
+ %cmp127 = icmp eq i8 %2, 2
+ %or.cond160 = and i1 %cmp124, %cmp127
+ br i1 %or.cond160, label %while.cond130.preheader, label %return, !prof !1007
+while.cond130.preheader:
+ %sunkaddr82 = ptrtoint i8* %4 to i64
+ %sunkaddr83 = add i64 %sunkaddr82, %lsr.iv27
+ %sunkaddr84 = inttoptr i64 %sunkaddr83 to i8*
+ %32 = load i8* %sunkaddr84, align 1
+ %cmp132244 = icmp eq i8 %32, 0
+ br i1 %cmp132244, label %return, label %land.rhs134.preheader, !prof !1008
+land.rhs134.preheader:
+ %scevgep29 = getelementptr i8* %5, i64 %lsr.iv27
+ %scevgep37 = getelementptr i8* %4, i64 %lsr.iv27
+ br label %land.rhs134
+land.rhs134:
+ %lsr.iv22 = phi i64 [ 0, %land.rhs134.preheader ], [ %lsr.iv.next23, %if.then152 ]
+ %33 = phi i8 [ %35, %if.then152 ], [ %32, %land.rhs134.preheader ]
+ %scevgep30 = getelementptr i8* %scevgep29, i64 %lsr.iv22
+ %34 = load i8* %scevgep30, align 1
+ %cmp136 = icmp eq i8 %34, 0
+ br i1 %cmp136, label %return, label %while.body139, !prof !1009
+while.body139:
+ %cmp142 = icmp eq i8 %33, %34
+ %cmp146 = icmp eq i8 %34, 42
+ %or.cond211 = or i1 %cmp142, %cmp146
+ %cmp150 = icmp eq i8 %33, 94
+ %or.cond212 = or i1 %or.cond211, %cmp150
+ br i1 %or.cond212, label %if.then152, label %return, !prof !1010
+if.then152:
+ %scevgep38 = getelementptr i8* %scevgep37, i64 %lsr.iv22
+ %scevgep39 = getelementptr i8* %scevgep38, i64 1
+ %35 = load i8* %scevgep39, align 1
+ %cmp132 = icmp eq i8 %35, 0
+ %lsr.iv.next23 = add i64 %lsr.iv22, 1
+ br i1 %cmp132, label %return, label %land.rhs134, !prof !1008
+return:
+ %retval.0 = phi i32 [ 0, %entry ], [ 1, %land.lhs.true52 ], [ 1, %land.lhs.true43 ], [ 0, %if.else123 ], [ 1, %while.cond59.preheader ], [ 1, %while.cond95.preheader ], [ 1, %while.cond130.preheader ], [ 1, %land.lhs.true28 ], [ 1, %if.then83 ], [ 0, %lor.lhs.false74 ], [ 1, %land.rhs ], [ 1, %if.then117 ], [ 0, %while.body104 ], [ 1, %land.rhs99 ], [ 1, %if.then152 ], [ 0, %while.body139 ], [ 1, %land.rhs134 ], [ 0, %while.body ]
+ ret i32 %retval.0
+}
+!181 = metadata !{metadata !"branch_weights", i32 662038, i32 1}
+!988 = metadata !{metadata !"branch_weights", i32 12091450, i32 1916}
+!989 = metadata !{metadata !"branch_weights", i32 7564670, i32 4526781}
+!990 = metadata !{metadata !"branch_weights", i32 7484958, i32 13283499}
+!991 = metadata !{metadata !"branch_weights", i32 8677007, i32 4606493}
+!992 = metadata !{metadata !"branch_weights", i32 -1172426948, i32 145094705}
+!993 = metadata !{metadata !"branch_weights", i32 1468914, i32 5683688}
+!994 = metadata !{metadata !"branch_weights", i32 114025221, i32 -1217548794, i32 -1199521551, i32 87712616}
+!995 = metadata !{metadata !"branch_weights", i32 1853716452, i32 -444717951, i32 932776759}
+!996 = metadata !{metadata !"branch_weights", i32 1004870, i32 20259}
+!997 = metadata !{metadata !"branch_weights", i32 20071, i32 189}
+!998 = metadata !{metadata !"branch_weights", i32 -1020255939, i32 572177766}
+!999 = metadata !{metadata !"branch_weights", i32 2666513, i32 3466431}
+!1000 = metadata !{metadata !"branch_weights", i32 5117635, i32 1859780}
+!1001 = metadata !{metadata !"branch_weights", i32 354902465, i32 -1444604407}
+!1002 = metadata !{metadata !"branch_weights", i32 -1762419279, i32 1592770684}
+!1003 = metadata !{metadata !"branch_weights", i32 1435905930, i32 -1951930624}
+!1004 = metadata !{metadata !"branch_weights", i32 1, i32 504888}
+!1005 = metadata !{metadata !"branch_weights", i32 94662, i32 504888}
+!1006 = metadata !{metadata !"branch_weights", i32 -1897793104, i32 160196332}
+!1007 = metadata !{metadata !"branch_weights", i32 2074643678, i32 -29579071}
+!1008 = metadata !{metadata !"branch_weights", i32 1, i32 226163}
+!1009 = metadata !{metadata !"branch_weights", i32 58357, i32 226163}
+!1010 = metadata !{metadata !"branch_weights", i32 -2072848646, i32 92907517}
diff --git a/test/CodeGen/X86/ragreedy-hoist-spill.ll b/test/CodeGen/X86/ragreedy-hoist-spill.ll
new file mode 100644
index 0000000..c6b28f7
--- /dev/null
+++ b/test/CodeGen/X86/ragreedy-hoist-spill.ll
@@ -0,0 +1,389 @@
+; RUN: llc < %s -mtriple=x86_64-apple-macosx -regalloc=greedy | FileCheck %s
+
+; This test case is reduced from the SyFgets function in 254.gap.
+; Make sure a spill is not hoisted into a hotter outer loop.
+
+%struct.TMP.1 = type { %struct.TMP.2*, %struct.TMP.2*, [1024 x i8] }
+%struct.TMP.2 = type { i8*, i32, i32, i16, i16, %struct.TMP.3, i32, i8*, i32 (i8*)*, i32 (i8*, i8*, i32)*, i64 (i8*, i64, i32)*, i32 (i8*, i8*, i32)*, %struct.TMP.3, %struct.TMP.4*, i32, [3 x i8], [1 x i8], %struct.TMP.3, i32, i64 }
+%struct.TMP.4 = type opaque
+%struct.TMP.3 = type { i8*, i32 }
+
+@syBuf = external global [16 x %struct.TMP.1], align 16
+@syHistory = external global [8192 x i8], align 16
+@SyFgets.yank = external global [512 x i8], align 16
+@syCTRO = external global i32, align 4
+
+; CHECK-LABEL: SyFgets
+define i8* @SyFgets(i8* %line, i64 %length, i64 %fid) {
+entry:
+ %sub.ptr.rhs.cast646 = ptrtoint i8* %line to i64
+ %old = alloca [512 x i8], align 16
+ %0 = getelementptr inbounds [512 x i8]* %old, i64 0, i64 0
+ switch i64 %fid, label %if.then [
+ i64 2, label %if.end
+ i64 0, label %if.end
+ ]
+
+if.then:
+ br label %cleanup
+
+if.end:
+ switch i64 undef, label %if.end25 [
+ i64 0, label %if.then4
+ i64 1, label %land.lhs.true14
+ ]
+
+if.then4:
+ br i1 undef, label %SyTime.exit, label %if.then.i
+
+if.then.i:
+ unreachable
+
+SyTime.exit:
+ br i1 undef, label %SyTime.exit2681, label %if.then.i2673
+
+if.then.i2673:
+ unreachable
+
+SyTime.exit2681:
+ br label %cleanup
+
+land.lhs.true14:
+ unreachable
+
+if.end25:
+ br i1 undef, label %SyTime.exit2720, label %if.then.i2712
+
+if.then.i2712:
+ unreachable
+
+SyTime.exit2720:
+ %add.ptr = getelementptr [512 x i8]* %old, i64 0, i64 512
+ %cmp293427 = icmp ult i8* %0, %add.ptr
+ br i1 %cmp293427, label %for.body.lr.ph, label %while.body.preheader
+
+for.body.lr.ph:
+ call void @llvm.memset.p0i8.i64(i8* undef, i8 32, i64 512, i32 16, i1 false)
+ br label %while.body.preheader
+
+while.body.preheader:
+ %add.ptr1603 = getelementptr [512 x i8]* null, i64 0, i64 512
+ %echo.i3101 = getelementptr [16 x %struct.TMP.1]* @syBuf, i64 0, i64 %fid, i32 1
+ %1 = xor i64 %sub.ptr.rhs.cast646, -1
+ br label %do.body
+
+do.body:
+ %ch2.0 = phi i32 [ 0, %while.body.preheader ], [ %ch.12.ch2.12, %do.body ]
+ %rep.0 = phi i32 [ 1, %while.body.preheader ], [ %rep.6, %do.body ]
+ store i32 0, i32* @syCTRO, align 4, !tbaa !1
+ %ch.0.ch2.0 = select i1 undef, i32 14, i32 %ch2.0
+ %ch2.2 = select i1 undef, i32 0, i32 %ch.0.ch2.0
+ %ch.2.ch2.2 = select i1 undef, i32 0, i32 %ch2.2
+ %ch2.4 = select i1 undef, i32 278, i32 %ch.2.ch2.2
+ %ch2.5 = select i1 undef, i32 0, i32 %ch2.4
+ %rep.2 = select i1 undef, i32 undef, i32 %rep.0
+ %ch.5.ch2.5 = select i1 undef, i32 undef, i32 %ch2.5
+ %ch2.7 = select i1 undef, i32 0, i32 %ch.5.ch2.5
+ %rep.3 = select i1 undef, i32 undef, i32 %rep.2
+ %ch.7.ch2.7 = select i1 false, i32 0, i32 %ch2.7
+ %mul98.rep.3 = select i1 false, i32 0, i32 %rep.3
+ %ch2.9 = select i1 undef, i32 undef, i32 %ch.7.ch2.7
+ %rep.5 = select i1 undef, i32 undef, i32 %mul98.rep.3
+ %ch2.10 = select i1 false, i32 undef, i32 %ch2.9
+ %rep.6 = select i1 false, i32 undef, i32 %rep.5
+ %isdigittmp = add i32 %ch2.10, -48
+ %isdigit = icmp ult i32 %isdigittmp, 10
+ %cmp119 = icmp eq i32 undef, 22
+ %or.cond1875 = and i1 %isdigit, %cmp119
+ %ch.10.ch2.10 = select i1 %or.cond1875, i32 undef, i32 %ch2.10
+ %.ch.10 = select i1 %or.cond1875, i32 0, i32 undef
+ %ch2.12 = select i1 undef, i32 %.ch.10, i32 %ch.10.ch2.10
+ %ch.12 = select i1 undef, i32 0, i32 %.ch.10
+ %ch.12.ch2.12 = select i1 false, i32 %ch.12, i32 %ch2.12
+ %.ch.12 = select i1 false, i32 0, i32 %ch.12
+ %cmp147 = icmp eq i32 %.ch.12, 0
+ br i1 %cmp147, label %do.body, label %do.end
+
+do.end:
+ %cmp164 = icmp eq i32 %ch.12.ch2.12, 21
+ %mul167 = shl i32 %rep.6, 2
+ %rep.8 = select i1 %cmp164, i32 %mul167, i32 %rep.6
+ %..ch.19 = select i1 false, i32 2, i32 0
+ br i1 undef, label %while.body200, label %while.end1465
+
+while.body200:
+ %dec3386.in = phi i32 [ %dec3386, %while.cond197.backedge ], [ %rep.8, %do.end ]
+ %oldc.13384 = phi i32 [ %oldc.1.be, %while.cond197.backedge ], [ 0, %do.end ]
+ %ch.213379 = phi i32 [ %last.1.be, %while.cond197.backedge ], [ %..ch.19, %do.end ]
+ %last.13371 = phi i32 [ %last.1.be, %while.cond197.backedge ], [ 0, %do.end ]
+ %dec3386 = add i32 %dec3386.in, -1
+ switch i32 %ch.213379, label %sw.default [
+ i32 1, label %while.cond201.preheader
+ i32 322, label %sw.bb206
+ i32 354, label %sw.bb206
+ i32 2, label %sw.bb243
+ i32 364, label %sw.bb1077
+ i32 326, label %sw.bb256
+ i32 358, label %sw.bb256
+ i32 341, label %sw.bb979
+ i32 323, label %while.cond1037.preheader
+ i32 373, label %sw.bb979
+ i32 4, label %if.then1477
+ i32 332, label %sw.bb1077
+ i32 11, label %for.cond357
+ i32 355, label %while.cond1037.preheader
+ i32 324, label %sw.bb474
+ i32 356, label %sw.bb474
+ i32 20, label %sw.bb566
+ i32 -1, label %while.cond197.backedge
+ i32 268, label %sw.bb1134
+ i32 16, label %while.cond635.preheader
+ i32 18, label %sw.bb956
+ i32 316, label %while.cond864
+ ]
+
+while.cond1037.preheader:
+ %cmp10393273 = icmp eq i8 undef, 0
+ br i1 %cmp10393273, label %if.end1070, label %land.rhs1041
+
+while.cond635.preheader:
+ br i1 undef, label %for.body643.us, label %while.cond661
+
+for.body643.us:
+ br label %for.body643.us
+
+while.cond201.preheader:
+ %umax = select i1 false, i64 undef, i64 %1
+ %2 = xor i64 %umax, -1
+ %3 = inttoptr i64 %2 to i8*
+ br label %while.cond197.backedge
+
+sw.bb206:
+ br label %while.cond197.backedge
+
+sw.bb243:
+ br label %while.cond197.backedge
+
+sw.bb256:
+ br label %while.cond197.backedge
+
+while.cond197.backedge:
+ %last.1.be = phi i32 [ %ch.213379, %sw.default ], [ -1, %while.body200 ], [ %ch.213379, %sw.bb1077 ], [ %ch.213379, %sw.bb979 ], [ 18, %sw.bb956 ], [ 20, %sw.bb566 ], [ %ch.213379, %for.end552 ], [ %ch.213379, %sw.bb256 ], [ 2, %sw.bb243 ], [ 1, %while.cond201.preheader ], [ 268, %for.cond1145.preheader ], [ %ch.213379, %sw.bb206 ]
+ %oldc.1.be = phi i32 [ %oldc.13384, %sw.default ], [ %oldc.13384, %while.body200 ], [ %oldc.13384, %sw.bb1077 ], [ %oldc.13384, %sw.bb979 ], [ %oldc.13384, %sw.bb956 ], [ %oldc.13384, %sw.bb566 ], [ %oldc.13384, %for.end552 ], [ %oldc.13384, %sw.bb256 ], [ %oldc.13384, %sw.bb243 ], [ %oldc.13384, %while.cond201.preheader ], [ 0, %for.cond1145.preheader ], [ %oldc.13384, %sw.bb206 ]
+ %cmp198 = icmp sgt i32 %dec3386, 0
+ br i1 %cmp198, label %while.body200, label %while.end1465
+
+for.cond357:
+ br label %for.cond357
+
+sw.bb474:
+ %cmp476 = icmp eq i8 undef, 0
+ br i1 %cmp476, label %if.end517, label %do.body479.preheader
+
+do.body479.preheader:
+ %cmp4833314 = icmp eq i8 undef, 0
+ br i1 %cmp4833314, label %if.end517, label %land.rhs485
+
+land.rhs485:
+ %incdec.ptr4803316 = phi i8* [ %incdec.ptr480, %do.body479.backedge.land.rhs485_crit_edge ], [ undef, %do.body479.preheader ]
+ %isascii.i.i27763151 = icmp sgt i8 undef, -1
+ br i1 %isascii.i.i27763151, label %cond.true.i.i2780, label %cond.false.i.i2782
+
+cond.true.i.i2780:
+ br i1 undef, label %land.lhs.true490, label %lor.rhs500
+
+cond.false.i.i2782:
+ unreachable
+
+land.lhs.true490:
+ br i1 false, label %lor.rhs500, label %do.body479.backedge
+
+lor.rhs500:
+ ; CHECK: lor.rhs500
+ ; Make sure that we don't hoist the spill to outer loops.
+ ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; CHECK: movq %r{{.*}}, {{[0-9]+}}(%rsp)
+ ; CHECK: callq {{.*}}maskrune
+ %call3.i.i2792 = call i32 @__maskrune(i32 undef, i64 256)
+ br i1 undef, label %land.lhs.true504, label %do.body479.backedge
+
+land.lhs.true504:
+ br i1 undef, label %do.body479.backedge, label %if.end517
+
+do.body479.backedge:
+ %incdec.ptr480 = getelementptr i8* %incdec.ptr4803316, i64 1
+ %cmp483 = icmp eq i8 undef, 0
+ br i1 %cmp483, label %if.end517, label %do.body479.backedge.land.rhs485_crit_edge
+
+do.body479.backedge.land.rhs485_crit_edge:
+ br label %land.rhs485
+
+if.end517:
+ %q.4 = phi i8* [ undef, %sw.bb474 ], [ undef, %do.body479.preheader ], [ %incdec.ptr480, %do.body479.backedge ], [ %incdec.ptr4803316, %land.lhs.true504 ]
+ switch i32 %last.13371, label %if.then532 [
+ i32 383, label %for.cond534
+ i32 356, label %for.cond534
+ i32 324, label %for.cond534
+ i32 24, label %for.cond534
+ i32 11, label %for.cond534
+ ]
+
+if.then532:
+ store i8 0, i8* getelementptr inbounds ([512 x i8]* @SyFgets.yank, i64 0, i64 0), align 16, !tbaa !5
+ br label %for.cond534
+
+for.cond534:
+ %cmp536 = icmp eq i8 undef, 0
+ br i1 %cmp536, label %for.cond542.preheader, label %for.cond534
+
+for.cond542.preheader:
+ br i1 undef, label %for.body545, label %for.end552
+
+for.body545:
+ br i1 undef, label %for.end552, label %for.body545
+
+for.end552:
+ %s.2.lcssa = phi i8* [ undef, %for.cond542.preheader ], [ %q.4, %for.body545 ]
+ %sub.ptr.lhs.cast553 = ptrtoint i8* %s.2.lcssa to i64
+ %sub.ptr.sub555 = sub i64 %sub.ptr.lhs.cast553, 0
+ %arrayidx556 = getelementptr i8* null, i64 %sub.ptr.sub555
+ store i8 0, i8* %arrayidx556, align 1, !tbaa !5
+ br label %while.cond197.backedge
+
+sw.bb566:
+ br label %while.cond197.backedge
+
+while.cond661:
+ br label %while.cond661
+
+while.cond864:
+ br label %while.cond864
+
+sw.bb956:
+ br i1 undef, label %if.then959, label %while.cond197.backedge
+
+if.then959:
+ br label %while.cond962
+
+while.cond962:
+ br label %while.cond962
+
+sw.bb979:
+ br label %while.cond197.backedge
+
+land.rhs1041:
+ unreachable
+
+if.end1070:
+ br label %sw.bb1077
+
+sw.bb1077:
+ br label %while.cond197.backedge
+
+sw.bb1134:
+ br i1 false, label %for.body1139, label %for.cond1145.preheader
+
+for.cond1145.preheader:
+ br i1 %cmp293427, label %for.body1150.lr.ph, label %while.cond197.backedge
+
+for.body1150.lr.ph:
+ unreachable
+
+for.body1139:
+ unreachable
+
+sw.default:
+ br label %while.cond197.backedge
+
+while.end1465:
+ %oldc.1.lcssa = phi i32 [ 0, %do.end ], [ %oldc.1.be, %while.cond197.backedge ]
+ %ch.21.lcssa = phi i32 [ %..ch.19, %do.end ], [ %last.1.be, %while.cond197.backedge ]
+ switch i32 %ch.21.lcssa, label %for.cond1480.preheader [
+ i32 -1, label %if.then1477
+ i32 15, label %if.then1477
+ i32 13, label %if.then1477
+ i32 10, label %if.then1477
+ ]
+
+for.cond1480.preheader:
+ br i1 undef, label %for.body1606.lr.ph, label %for.end1609
+
+if.then1477:
+ %p.1.lcssa3539 = phi i8* [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ null, %while.end1465 ], [ %line, %while.body200 ]
+ %call1.i3057 = call i64 @"\01_write"(i32 undef, i8* undef, i64 1)
+ %sub.ptr.lhs.cast1717 = ptrtoint i8* %p.1.lcssa3539 to i64
+ %sub.ptr.sub1719 = sub i64 %sub.ptr.lhs.cast1717, %sub.ptr.rhs.cast646
+ %idx.neg1727 = sub i64 0, %sub.ptr.sub1719
+ br label %for.body1723
+
+for.body1606.lr.ph:
+ br label %for.end1609
+
+for.end1609:
+ br i1 undef, label %for.cond1659.preheader, label %land.lhs.true1614
+
+land.lhs.true1614:
+ br label %for.cond1659.preheader
+
+for.cond1659.preheader:
+ %cmp16623414 = icmp ult i8* undef, %add.ptr1603
+ br i1 %cmp16623414, label %for.body1664.lr.ph, label %while.body1703.lr.ph
+
+for.body1664.lr.ph:
+ %cmp16773405 = icmp slt i64 undef, undef
+ br i1 %cmp16773405, label %while.body1679, label %while.cond1683.preheader
+
+while.body1703.lr.ph:
+ unreachable
+
+while.cond1683.preheader:
+ br i1 undef, label %while.body1691, label %while.end1693
+
+while.body1679:
+ %oldc.43406 = phi i32 [ %inc, %syEchoch.exit3070 ], [ %oldc.1.lcssa, %for.body1664.lr.ph ]
+ %4 = load %struct.TMP.2** %echo.i3101, align 8, !tbaa !6
+ %call.i3062 = call i32 @fileno(%struct.TMP.2* %4)
+ br i1 undef, label %if.then.i3069, label %syEchoch.exit3070
+
+if.then.i3069:
+ br label %syEchoch.exit3070
+
+syEchoch.exit3070:
+ %inc = add i32 %oldc.43406, 1
+ %conv1672 = sext i32 %inc to i64
+ %cmp1677 = icmp slt i64 %conv1672, undef
+ br i1 %cmp1677, label %while.body1679, label %while.cond1683.preheader
+
+while.body1691:
+ unreachable
+
+while.end1693:
+ unreachable
+
+for.body1723:
+ %q.303203 = phi i8* [ getelementptr inbounds ([8192 x i8]* @syHistory, i64 0, i64 8189), %if.then1477 ], [ %incdec.ptr1730, %for.body1723 ]
+ %add.ptr1728 = getelementptr i8* %q.303203, i64 %idx.neg1727
+ %5 = load i8* %add.ptr1728, align 1, !tbaa !5
+ %incdec.ptr1730 = getelementptr i8* %q.303203, i64 -1
+ br label %for.body1723
+
+cleanup:
+ ret i8* undef
+}
+
+declare i32 @fileno(%struct.TMP.2* nocapture)
+declare i64 @"\01_write"(i32, i8*, i64)
+declare i32 @__maskrune(i32, i64)
+declare void @llvm.memset.p0i8.i64(i8* nocapture, i8, i64, i32, i1)
+
+!llvm.ident = !{!0}
+
+!0 = metadata !{metadata !"clang version 3.5.0 (trunk 204257)"}
+!1 = metadata !{metadata !2, metadata !2, i64 0}
+!2 = metadata !{metadata !"int", metadata !3, i64 0}
+!3 = metadata !{metadata !"omnipotent char", metadata !4, i64 0}
+!4 = metadata !{metadata !"Simple C/C++ TBAA"}
+!5 = metadata !{metadata !3, metadata !3, i64 0}
+!6 = metadata !{metadata !7, metadata !8, i64 8}
+!7 = metadata !{metadata !"", metadata !8, i64 0, metadata !8, i64 8, metadata !3, i64 16}
+!8 = metadata !{metadata !"any pointer", metadata !3, i64 0}
diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
new file mode 100644
index 0000000..f3669fb
--- /dev/null
+++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -0,0 +1,168 @@
+; RUN: llc -regalloc=greedy -relocation-model=pic < %s 2>&1 | FileCheck %s
+; Without last-chance recoloring, this test fails with:
+; "ran out of registers".
+
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx"
+
+@fp_dh_36985b17790d59a27994eaab5dcb00ee = external constant [499 x i32]
+@fp_dh_18716afa4a5354de0a302c8edb3b0ee1 = external global i32
+@fp_dh_20a33cdeefab8f4c8887e82766cb9dcb = external global i8*
+@fp_dh_9d93c897906e39883c58b034c8e786b2 = external global [5419648 x i8], align 16
+
+; Function Attrs: nounwind ssp
+; CHECK-NOT: ran out of registers during register allocation
+define void @fp_dh_f870bf31fd8ffe068450366e3f05389a(i8* %arg) #0 {
+bb:
+ indirectbr i8* undef, [label %bb85, label %bb206]
+
+bb85: ; preds = %bb222, %bb85, %bb
+ store i8* blockaddress(@fp_dh_f870bf31fd8ffe068450366e3f05389a, %bb206), i8** undef, align 4
+ indirectbr i8* undef, [label %bb439, label %bb85]
+
+bb206: ; preds = %bb
+ %tmp = getelementptr [499 x i32]* @fp_dh_36985b17790d59a27994eaab5dcb00ee, i32 0, i32 undef
+ %tmp207 = load i32* %tmp
+ %tmp208 = add i32 %tmp207, 1
+ %tmp209 = inttoptr i32 %tmp208 to i8*
+ indirectbr i8* %tmp209, [label %bb213]
+
+bb213: ; preds = %bb206
+ %tmp214 = load i32* @fp_dh_18716afa4a5354de0a302c8edb3b0ee1, align 4
+ %tmp215 = load i8** @fp_dh_20a33cdeefab8f4c8887e82766cb9dcb, align 4
+ %tmp216 = urem i32 -717428541, %tmp214
+ %tmp217 = getelementptr i8* %tmp215, i32 %tmp216
+ %tmp218 = bitcast i8* %tmp217 to i32*
+ %tmp219 = load i32* %tmp218, align 4
+ store i32 %tmp219, i32* undef, align 4
+ %tmp220 = select i1 false, i32 359373646, i32 1677237955
+ %tmp221 = add i32 %tmp220, 0
+ indirectbr i8* undef, [label %bb432, label %bb222]
+
+bb222: ; preds = %bb213
+ %tmp224 = load i32* undef, align 4
+ %tmp225 = load i32* undef, align 4
+ %tmp226 = xor i32 %tmp225, %tmp224
+ %tmp227 = shl i32 %tmp226, 1
+ %tmp228 = and i32 %tmp227, -2048880334
+ %tmp229 = sub i32 0, %tmp228
+ %tmp230 = add i32 0, %tmp229
+ %tmp231 = xor i32 %tmp230, 1059356227
+ %tmp232 = mul i32 %tmp231, 1603744721
+ %tmp233 = urem i32 %tmp232, 259
+ %tmp234 = getelementptr [259 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 2039075) to [259 x i8]*), i32 0, i32 %tmp233
+ %tmp235 = load i8* %tmp234, align 1
+ %tmp236 = add i32 %tmp233, 2
+ %tmp237 = getelementptr [264 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 3388166) to [264 x i8]*), i32 0, i32 %tmp236
+ %tmp238 = load i8* %tmp237, align 1
+ %tmp239 = getelementptr [265 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 1325165) to [265 x i8]*), i32 0, i32 0
+ %tmp240 = load i8* %tmp239, align 1
+ %tmp241 = add i32 %tmp233, 6
+ %tmp242 = trunc i32 %tmp241 to i8
+ %tmp243 = mul i8 %tmp242, -3
+ %tmp244 = add i8 %tmp243, 3
+ %tmp245 = mul i8 %tmp242, -6
+ %tmp246 = and i8 %tmp245, 6
+ %tmp247 = sub i8 0, %tmp246
+ %tmp248 = add i8 %tmp244, %tmp247
+ %tmp249 = load i8* undef, align 1
+ %tmp250 = xor i8 %tmp235, 17
+ %tmp251 = xor i8 %tmp250, %tmp238
+ %tmp252 = xor i8 %tmp251, %tmp240
+ %tmp253 = xor i8 %tmp252, %tmp249
+ %tmp254 = xor i8 %tmp253, %tmp248
+ %tmp255 = zext i8 %tmp254 to i16
+ %tmp256 = shl nuw i16 %tmp255, 8
+ %tmp257 = load i8* null, align 1
+ %tmp258 = load i32* @fp_dh_18716afa4a5354de0a302c8edb3b0ee1, align 4
+ %tmp259 = load i8** @fp_dh_20a33cdeefab8f4c8887e82766cb9dcb, align 4
+ %tmp260 = urem i32 -717428541, %tmp258
+ %tmp261 = getelementptr i8* %tmp259, i32 %tmp260
+ %tmp262 = bitcast i8* %tmp261 to i32*
+ %tmp263 = load i32* %tmp262, align 4
+ %tmp264 = xor i32 %tmp263, 0
+ %tmp265 = shl i32 %tmp264, 1
+ %tmp266 = and i32 %tmp265, -1312119832
+ %tmp267 = sub i32 0, %tmp266
+ %tmp268 = add i32 0, %tmp267
+ %tmp269 = xor i32 %tmp268, 623994670
+ %tmp270 = mul i32 %tmp269, 1603744721
+ %tmp271 = urem i32 %tmp270, 259
+ %tmp274 = add i32 %tmp271, 3
+ %tmp275 = getelementptr [265 x i8]* bitcast (i8* getelementptr inbounds ([5419648 x i8]* @fp_dh_9d93c897906e39883c58b034c8e786b2, i32 0, i32 1325165) to [265 x i8]*), i32 0, i32 %tmp274
+ %tmp276 = load i8* %tmp275, align 1
+ %tmp277 = add i32 %tmp271, 6
+ %tmp278 = trunc i32 %tmp277 to i8
+ %tmp279 = mul i8 %tmp278, -3
+ %tmp280 = add i8 %tmp279, 31
+ %tmp281 = add i8 %tmp280, 0
+ %tmp282 = xor i8 %tmp257, 13
+ %tmp283 = xor i8 %tmp282, 0
+ %tmp284 = xor i8 %tmp283, 0
+ %tmp285 = xor i8 %tmp284, %tmp276
+ %tmp286 = xor i8 %tmp285, %tmp281
+ %tmp287 = zext i8 %tmp286 to i16
+ %tmp288 = or i16 %tmp287, %tmp256
+ %tmp289 = xor i16 %tmp288, 14330
+ %tmp290 = add i16 0, %tmp289
+ %tmp291 = add i16 %tmp290, -14330
+ %tmp292 = zext i16 %tmp291 to i32
+ %tmp293 = add i16 %tmp290, -14330
+ %tmp294 = lshr i16 %tmp293, 12
+ %tmp295 = zext i16 %tmp294 to i32
+ %tmp296 = sub i32 0, %tmp295
+ %tmp297 = xor i32 %tmp296, 16
+ %tmp298 = add i32 0, %tmp297
+ %tmp299 = and i32 %tmp298, 31
+ %tmp300 = and i32 %tmp292, 30864
+ %tmp301 = shl i32 %tmp300, %tmp299
+ %tmp302 = xor i32 0, %tmp301
+ %tmp303 = add i32 0, %tmp302
+ %tmp304 = and i32 %tmp298, 31
+ %tmp305 = and i32 %tmp303, 25568
+ %tmp306 = lshr i32 %tmp305, %tmp304
+ %tmp307 = xor i32 0, %tmp306
+ %tmp308 = add i32 0, %tmp307
+ %tmp309 = trunc i32 %tmp308 to i16
+ %tmp310 = shl i16 %tmp309, 1
+ %tmp311 = and i16 %tmp310, -4648
+ %tmp312 = shl i16 %tmp309, 1
+ %tmp313 = and i16 %tmp312, 4646
+ %tmp314 = xor i16 %tmp311, 17700
+ %tmp315 = xor i16 %tmp313, 17700
+ %tmp316 = add i16 %tmp314, %tmp315
+ %tmp317 = and i16 %tmp314, %tmp315
+ %tmp318 = shl nuw i16 %tmp317, 1
+ %tmp319 = sub i16 0, %tmp318
+ %tmp320 = add i16 %tmp316, %tmp319
+ %tmp321 = and i16 %tmp320, 29906
+ %tmp322 = xor i16 %tmp309, 14953
+ %tmp323 = add i16 0, %tmp322
+ %tmp324 = sub i16 0, %tmp321
+ %tmp325 = xor i16 %tmp324, %tmp323
+ %tmp326 = add i16 0, %tmp325
+ %tmp327 = add i32 %tmp221, 1161362661
+ %tmp333 = icmp eq i16 %tmp326, 14953
+ %tmp334 = add i32 %tmp327, -1456704142
+ %tmp335 = zext i1 %tmp333 to i32
+ %tmp336 = add i32 %tmp334, %tmp335
+ %tmp337 = getelementptr [499 x i32]* @fp_dh_36985b17790d59a27994eaab5dcb00ee, i32 0, i32 %tmp336
+ %tmp338 = load i32* %tmp337
+ %tmp339 = add i32 %tmp338, 1
+ %tmp340 = inttoptr i32 %tmp339 to i8*
+ indirectbr i8* %tmp340, [label %bb85, label %bb439]
+
+bb432: ; preds = %bb432, %bb213
+ %tmp433 = phi i32 [ %tmp221, %bb213 ], [ %tmp433, %bb432 ]
+ %tmp434 = add i32 %tmp433, 1022523279
+ %tmp435 = getelementptr [499 x i32]* @fp_dh_36985b17790d59a27994eaab5dcb00ee, i32 0, i32 %tmp434
+ %tmp436 = load i32* %tmp435
+ %tmp437 = add i32 %tmp436, 1
+ %tmp438 = inttoptr i32 %tmp437 to i8*
+ indirectbr i8* %tmp438, [label %bb432]
+
+bb439: ; preds = %bb222, %bb85
+ ret void
+}
+
+attributes #0 = { nounwind ssp "less-precise-fpmad"="false" "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
diff --git a/test/CodeGen/X86/rot16.ll b/test/CodeGen/X86/rot16.ll
index 0293f4e..6d7c702 100644
--- a/test/CodeGen/X86/rot16.ll
+++ b/test/CodeGen/X86/rot16.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s
define i16 @foo(i16 %x, i16 %y, i16 %z) nounwind readnone {
entry:
diff --git a/test/CodeGen/X86/rotate3.ll b/test/CodeGen/X86/rotate3.ll
new file mode 100644
index 0000000..b92f7c2
--- /dev/null
+++ b/test/CodeGen/X86/rotate3.ll
@@ -0,0 +1,76 @@
+; Check that (or (shl x, y), (srl x, (sub 32, y))) is folded into (rotl x, y)
+; and (or (shl x, (sub 32, y)), (srl x, y)) into (rotr x, y) even if the
+; argument is zero extended. Fix for PR16726.
+
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+define zeroext i8 @rolbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i8 %x_arg to i32
+ %tmp3 = shl i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 8, %nBits_arg
+ %tmp10 = lshr i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i8
+ ret i8 %tmp12
+}
+; CHECK: rolb %cl, %{{[a-z0-9]+}}
+
+
+define zeroext i8 @rorbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i8 %x_arg to i32
+ %tmp3 = lshr i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 8, %nBits_arg
+ %tmp10 = shl i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i8
+ ret i8 %tmp12
+}
+; CHECK: rorb %cl, %{{[a-z0-9]+}}
+
+define zeroext i16 @rolword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i16 %x_arg to i32
+ %tmp3 = shl i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 16, %nBits_arg
+ %tmp10 = lshr i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i16
+ ret i16 %tmp12
+}
+; CHECK: rolw %cl, %{{[a-z0-9]+}}
+
+define zeroext i16 @rorword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i16 %x_arg to i32
+ %tmp3 = lshr i32 %tmp1, %nBits_arg
+ %tmp8 = sub i32 16, %nBits_arg
+ %tmp10 = shl i32 %tmp1, %tmp8
+ %tmp11 = or i32 %tmp3, %tmp10
+ %tmp12 = trunc i32 %tmp11 to i16
+ ret i16 %tmp12
+}
+; CHECK: rorw %cl, %{{[a-z0-9]+}}
+
+define i64 @roldword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i32 %x_arg to i64
+ %tmp3 = shl i64 %tmp1, %nBits_arg
+ %tmp8 = sub i64 32, %nBits_arg
+ %tmp10 = lshr i64 %tmp1, %tmp8
+ %tmp11 = or i64 %tmp3, %tmp10
+ ret i64 %tmp11
+}
+; CHECK: roll %cl, %{{[a-z0-9]+}}
+
+define zeroext i64 @rordword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
+entry:
+ %tmp1 = zext i32 %x_arg to i64
+ %tmp3 = lshr i64 %tmp1, %nBits_arg
+ %tmp8 = sub i64 32, %nBits_arg
+ %tmp10 = shl i64 %tmp1, %tmp8
+ %tmp11 = or i64 %tmp3, %tmp10
+ ret i64 %tmp11
+}
+; CHECK: rorl %cl, %{{[a-z0-9]+}}
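For reference, a minimal C sketch of the source-level idiom these tests exercise (the function name is illustrative, not taken from PR16726). The value is zero-extended to a wider type before the two shifts, which is exactly the case the fold now has to see through:

    /* Rotate an 8-bit value left by n. The widening to unsigned mirrors
       the zext in the IR above; after the fold, the shl/lshr/or sequence
       selects a single rolb instead of separate shifts and an or. */
    unsigned char rol8(unsigned char x, unsigned n) {
        unsigned wide = (unsigned)x;   /* zero extension */
        return (unsigned char)((wide << n) | (wide >> (8 - n)));
    }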
diff --git a/test/CodeGen/X86/rotate4.ll b/test/CodeGen/X86/rotate4.ll
new file mode 100644
index 0000000..5372612
--- /dev/null
+++ b/test/CodeGen/X86/rotate4.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=generic | FileCheck %s
+
+; Check that we recognize this idiom for rotation too:
+; a << (b & (OpSize-1)) | a >> ((0 - b) & (OpSize-1))
+
+define i32 @rotate_left_32(i32 %a, i32 %b) {
+; CHECK-LABEL: rotate_left_32:
+; CHECK-NOT: and
+; CHECK: roll
+entry:
+ %and = and i32 %b, 31
+ %shl = shl i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = lshr i32 %a, %and3
+ %or = or i32 %shl, %shr
+ ret i32 %or
+}
+
+define i32 @rotate_right_32(i32 %a, i32 %b) {
+; CHECK-LABEL: rotate_right_32:
+; CHECK-NOT: and
+; CHECK: rorl
+entry:
+ %and = and i32 %b, 31
+ %shl = lshr i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = shl i32 %a, %and3
+ %or = or i32 %shl, %shr
+ ret i32 %or
+}
+
+define i64 @rotate_left_64(i64 %a, i64 %b) {
+; CHECK-LABEL: rotate_left_64:
+; CHECK-NOT: and
+; CHECK: rolq
+entry:
+ %and = and i64 %b, 63
+ %shl = shl i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = lshr i64 %a, %and3
+ %or = or i64 %shl, %shr
+ ret i64 %or
+}
+
+define i64 @rotate_right_64(i64 %a, i64 %b) {
+; CHECK-LABEL: rotate_right_64:
+; CHECK-NOT: and
+; CHECK: rorq
+entry:
+ %and = and i64 %b, 63
+ %shl = lshr i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = shl i64 %a, %and3
+ %or = or i64 %shl, %shr
+ ret i64 %or
+}
+
+; Also check the memory-operand forms.
+
+define void @rotate_left_m32(i32 *%pa, i32 %b) {
+; CHECK-LABEL: rotate_left_m32:
+; CHECK-NOT: and
+; CHECK: roll
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i32* %pa, align 16
+ %and = and i32 %b, 31
+ %shl = shl i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = lshr i32 %a, %and3
+ %or = or i32 %shl, %shr
+ store i32 %or, i32* %pa, align 32
+ ret void
+}
+
+define void @rotate_right_m32(i32 *%pa, i32 %b) {
+; CHECK-LABEL: rotate_right_m32:
+; CHECK-NOT: and
+; CHECK: rorl
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i32* %pa, align 16
+ %and = and i32 %b, 31
+ %shl = lshr i32 %a, %and
+ %0 = sub i32 0, %b
+ %and3 = and i32 %0, 31
+ %shr = shl i32 %a, %and3
+ %or = or i32 %shl, %shr
+ store i32 %or, i32* %pa, align 32
+ ret void
+}
+
+define void @rotate_left_m64(i64 *%pa, i64 %b) {
+; CHECK-LABEL: rotate_left_m64:
+; CHECK-NOT: and
+; CHECK: rolq
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i64* %pa, align 16
+ %and = and i64 %b, 63
+ %shl = shl i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = lshr i64 %a, %and3
+ %or = or i64 %shl, %shr
+ store i64 %or, i64* %pa, align 64
+ ret void
+}
+
+define void @rotate_right_m64(i64 *%pa, i64 %b) {
+; CHECK-LABEL: rotate_right_m64:
+; CHECK-NOT: and
+; CHECK: rorq
+; no store:
+; CHECK-NOT: mov
+entry:
+ %a = load i64* %pa, align 16
+ %and = and i64 %b, 63
+ %shl = lshr i64 %a, %and
+ %0 = sub i64 0, %b
+ %and3 = and i64 %0, 63
+ %shr = shl i64 %a, %and3
+ %or = or i64 %shl, %shr
+ store i64 %or, i64* %pa, align 64
+ ret void
+}
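A minimal C rendering of the idiom named in the comment above (illustrative only). Masking both counts with OpSize-1 keeps every shift in range, including b == 0, and hardware rotates mask the count the same way, which is why no "and" should survive in the generated code:

    /* 32-bit rotate left: a << (b & 31) | a >> ((0 - b) & 31).
       Well defined in C for every b because both counts are reduced
       mod 32; selects a single roll with no masking instructions. */
    unsigned rotl32(unsigned a, unsigned b) {
        return (a << (b & 31)) | (a >> ((0u - b) & 31));
    }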
diff --git a/test/CodeGen/X86/saddo-redundant-add.ll b/test/CodeGen/X86/saddo-redundant-add.ll
new file mode 100644
index 0000000..c56c686
--- /dev/null
+++ b/test/CodeGen/X86/saddo-redundant-add.ll
@@ -0,0 +1,34 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+
+define void @redundant_add(i64 %n) {
+; Check that we don't create two additions for the sadd.with.overflow.
+; CHECK-LABEL: redundant_add
+; CHECK-NOT: leaq
+; CHECK-NOT: addq
+; CHECK: incq
+; CHECK-NEXT: jno
+entry:
+ br label %exit_check
+
+exit_check:
+ %i = phi i64 [ 0, %entry ], [ %i.next, %loop ]
+ %c = icmp slt i64 %i, %n
+ br i1 %c, label %loop, label %exit
+
+loop:
+ %i.o = tail call { i64, i1 } @llvm.sadd.with.overflow.i64(i64 %i, i64 1)
+ %i.next = extractvalue { i64, i1 } %i.o, 0
+ %o = extractvalue { i64, i1 } %i.o, 1
+ br i1 %o, label %overflow, label %exit_check
+
+exit:
+ ret void
+
+overflow:
+ tail call void @llvm.trap()
+ unreachable
+}
+
+declare { i64, i1 } @llvm.sadd.with.overflow.i64(i64, i64)
+declare void @llvm.trap()
+
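The C-level shape of this pattern is an overflow-checked increment; a minimal sketch, assuming a compiler that provides __builtin_add_overflow (GCC and Clang do):

    #include <stdlib.h>

    /* Clang lowers the builtin to llvm.sadd.with.overflow.i64; with the
       redundant add removed, the increment and the overflow test share
       one instruction (incq followed by jno). */
    long checked_inc(long i) {
        long next;
        if (__builtin_add_overflow(i, 1L, &next))
            abort();   /* corresponds to the llvm.trap() path above */
        return next;
    }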
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index 08a98ef..c02152b 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -4,6 +4,7 @@
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
; We used to crash with filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -filetype=obj
@@ -12,16 +13,14 @@
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -filetype=obj
; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -filetype=obj
; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris -segmented-stacks 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-Solaris
-; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks 2> %t.log
-; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-MinGW
; RUN: not llc < %s -mcpu=generic -mtriple=i686-freebsd -segmented-stacks 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X32-FreeBSD
; X64-Solaris: Segmented stacks not supported on this platform
-; X64-MinGW: Segmented stacks not supported on this platform
; X32-FreeBSD: Segmented stacks not supported on FreeBSD i386
; Just to prevent the alloca from being optimized away
@@ -83,6 +82,16 @@ define void @test_basic() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_basic:
+
+; X64-MinGW: cmpq %gs:40, %rsp
+; X64-MinGW-NEXT: ja .LBB0_2
+
+; X64-MinGW: movabsq $72, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD-LABEL: test_basic:
; X64-FreeBSD: cmpq %fs:24, %rsp
@@ -145,6 +154,17 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_nested:
+; X64-MinGW: cmpq %gs:40, %rsp
+; X64-MinGW-NEXT: ja .LBB1_2
+
+; X64-MinGW: movq %r10, %rax
+; X64-MinGW-NEXT: movabsq $0, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+; X64-MinGW-NEXT: movq %rax, %r10
+
; X64-FreeBSD: cmpq %fs:24, %rsp
; X64-FreeBSD-NEXT: ja .LBB1_2
@@ -208,6 +228,16 @@ define void @test_large() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_large:
+; X64-MinGW: leaq -40040(%rsp), %r11
+; X64-MinGW-NEXT: cmpq %gs:40, %r11
+; X64-MinGW-NEXT: ja .LBB2_2
+
+; X64-MinGW: movabsq $40040, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD: leaq -40008(%rsp), %r11
; X64-FreeBSD-NEXT: cmpq %fs:24, %r11
; X64-FreeBSD-NEXT: ja .LBB2_2
@@ -275,6 +305,16 @@ define fastcc void @test_fastcc() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_fastcc:
+
+; X64-MinGW: cmpq %gs:40, %rsp
+; X64-MinGW-NEXT: ja .LBB3_2
+
+; X64-MinGW: movabsq $72, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD-LABEL: test_fastcc:
; X64-FreeBSD: cmpq %fs:24, %rsp
@@ -348,6 +388,17 @@ define fastcc void @test_fastcc_large() {
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
+; X64-MinGW-LABEL: test_fastcc_large:
+
+; X64-MinGW: leaq -40040(%rsp), %r11
+; X64-MinGW-NEXT: cmpq %gs:40, %r11
+; X64-MinGW-NEXT: ja .LBB4_2
+
+; X64-MinGW: movabsq $40040, %r10
+; X64-MinGW-NEXT: movabsq $32, %r11
+; X64-MinGW-NEXT: callq __morestack
+; X64-MinGW-NEXT: retq
+
; X64-FreeBSD-LABEL: test_fastcc_large:
; X64-FreeBSD: leaq -40008(%rsp), %r11
diff --git a/test/CodeGen/X86/setjmp-spills.ll b/test/CodeGen/X86/setjmp-spills.ll
new file mode 100644
index 0000000..c35caae
--- /dev/null
+++ b/test/CodeGen/X86/setjmp-spills.ll
@@ -0,0 +1,141 @@
+; RUN: llc < %s -mtriple=i386-linux | FileCheck %s -check-prefix=X86-32
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
+
+declare i32 @get_val()
+declare void @use_val(i32)
+declare i1 @setjmp()
+declare void @longjmp()
+declare void @personality()
+
+
+; Test that llc avoids reusing spill slots in functions that call
+; setjmp(), whether they use "call" or "invoke" for calling setjmp()
+; (PR18244).
+
+define void @setjmp_caller() {
+; X86-32-LABEL: setjmp_caller:
+; X86-64-LABEL: setjmp_caller:
+; This code keeps enough variables live across the setjmp() call that
+; they don't all fit in registers and the compiler will allocate a
+; spill slot.
+ %a1 = call i32 @get_val()
+ %a2 = call i32 @get_val()
+ %a3 = call i32 @get_val()
+ %a4 = call i32 @get_val()
+ %a5 = call i32 @get_val()
+ %a6 = call i32 @get_val()
+ %a7 = call i32 @get_val()
+ %a8 = call i32 @get_val()
+; X86-32: movl %eax, [[SPILL_SLOT:[0-9]+]](%esp)
+; X86-32: calll get_val
+; X86-64: movl %eax, [[SPILL_SLOT:[0-9]+]](%rsp)
+; X86-64: callq get_val
+
+ %setjmp_result = call i1 @setjmp() returns_twice
+ br i1 %setjmp_result, label %second, label %first
+; X86-32: calll setjmp
+; X86-64: callq setjmp
+
+; Again, keep enough variables live that they need spill slots. Since
+; this function calls a returns_twice function (setjmp()), the
+; compiler should not reuse the spill slots: longjmp() can return to a
+; point where the first set of spill slots is still live.
+first:
+ %b1 = call i32 @get_val()
+ %b2 = call i32 @get_val()
+ %b3 = call i32 @get_val()
+ %b4 = call i32 @get_val()
+ %b5 = call i32 @get_val()
+ %b6 = call i32 @get_val()
+ %b7 = call i32 @get_val()
+ %b8 = call i32 @get_val()
+ call void @use_val(i32 %b1)
+ call void @use_val(i32 %b2)
+ call void @use_val(i32 %b3)
+ call void @use_val(i32 %b4)
+ call void @use_val(i32 %b5)
+ call void @use_val(i32 %b6)
+ call void @use_val(i32 %b7)
+ call void @use_val(i32 %b8)
+ call void @longjmp()
+ unreachable
+; X86-32-NOT: movl {{.*}}, [[SPILL_SLOT]](%esp)
+; X86-64-NOT: movl {{.*}}, [[SPILL_SLOT]](%rsp)
+
+second:
+ call void @use_val(i32 %a1)
+ call void @use_val(i32 %a2)
+ call void @use_val(i32 %a3)
+ call void @use_val(i32 %a4)
+ call void @use_val(i32 %a5)
+ call void @use_val(i32 %a6)
+ call void @use_val(i32 %a7)
+ call void @use_val(i32 %a8)
+ ret void
+}
+
+
+; This is the same as above, but using "invoke" rather than "call" to
+; call setjmp().
+
+define void @setjmp_invoker() {
+; X86-32-LABEL: setjmp_invoker:
+; X86-64-LABEL: setjmp_invoker:
+ %a1 = call i32 @get_val()
+ %a2 = call i32 @get_val()
+ %a3 = call i32 @get_val()
+ %a4 = call i32 @get_val()
+ %a5 = call i32 @get_val()
+ %a6 = call i32 @get_val()
+ %a7 = call i32 @get_val()
+ %a8 = call i32 @get_val()
+; X86-32: movl %eax, [[SPILL_SLOT:[0-9]+]](%esp)
+; X86-32: calll get_val
+; X86-64: movl %eax, [[SPILL_SLOT:[0-9]+]](%rsp)
+; X86-64: callq get_val
+
+ %setjmp_result = invoke i1 @setjmp() returns_twice
+ to label %cont unwind label %lpad
+; X86-32: calll setjmp
+; X86-64: callq setjmp
+
+cont:
+ br i1 %setjmp_result, label %second, label %first
+
+lpad:
+ %lp = landingpad { i8*, i32 } personality void ()* @personality cleanup
+ unreachable
+
+first:
+ %b1 = call i32 @get_val()
+ %b2 = call i32 @get_val()
+ %b3 = call i32 @get_val()
+ %b4 = call i32 @get_val()
+ %b5 = call i32 @get_val()
+ %b6 = call i32 @get_val()
+ %b7 = call i32 @get_val()
+ %b8 = call i32 @get_val()
+ call void @use_val(i32 %b1)
+ call void @use_val(i32 %b2)
+ call void @use_val(i32 %b3)
+ call void @use_val(i32 %b4)
+ call void @use_val(i32 %b5)
+ call void @use_val(i32 %b6)
+ call void @use_val(i32 %b7)
+ call void @use_val(i32 %b8)
+ call void @longjmp()
+ unreachable
+; X86-32-NOT: movl {{.*}}, [[SPILL_SLOT]](%esp)
+; X86-64-NOT: movl {{.*}}, [[SPILL_SLOT]](%rsp)
+
+second:
+ call void @use_val(i32 %a1)
+ call void @use_val(i32 %a2)
+ call void @use_val(i32 %a3)
+ call void @use_val(i32 %a4)
+ call void @use_val(i32 %a5)
+ call void @use_val(i32 %a6)
+ call void @use_val(i32 %a7)
+ call void @use_val(i32 %a8)
+ ret void
+}
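As a sketch of the hazard the test guards against (names are illustrative): values spilled before setjmp() must keep their slots, because longjmp() transfers control back to the setjmp() return point, where those slots are still live:

    #include <setjmp.h>

    static jmp_buf env;

    /* a1..a4 are spilled across the calls; if the code after setjmp()
       reused their slots, the second return would read clobbered data. */
    void sketch(int (*get)(void), void (*use)(int)) {
        int a1 = get(), a2 = get(), a3 = get(), a4 = get();
        if (setjmp(env)) {          /* second return, via longjmp() */
            use(a1); use(a2); use(a3); use(a4);
            return;
        }
        use(get()); use(get());     /* must not overwrite a1..a4's slots */
        longjmp(env, 1);
    }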
diff --git a/test/CodeGen/X86/shift-combine-crash.ll b/test/CodeGen/X86/shift-combine-crash.ll
new file mode 100644
index 0000000..a69a907
--- /dev/null
+++ b/test/CodeGen/X86/shift-combine-crash.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 > /dev/null
+
+; Verify that DAGCombiner doesn't crash with an assertion failure in the
+; attempt to cast an ISD::UNDEF node to a ConstantSDNode.
+
+; During type legalization, the vector shift operation in function @test1 is
+; split into two legal shifts that work on <2 x i64> elements.
+; The first shift of the legalized sequence is a shift by all undefs: the
+; count <i64 undef, i64 undef, i64 1, i64 2> splits into <undef, undef>
+; and <1, 2>.
+; DAGCombiner will then try to simplify the vector shift and check if the
+; vector of shift counts is a splat. Make sure that llc doesn't crash
+; at that stage.
+
+
+define <4 x i64> @test1(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 1, i64 2>
+ ret <4 x i64> %shl
+}
+
+; Also, verify that DAGCombiner doesn't crash when trying to combine shifts
+; with different combinations of undef elements in the vector shift count.
+
+define <4 x i64> @test2(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 2, i64 3, i64 undef, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test3(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 2, i64 undef, i64 3, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test4(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 2, i64 undef, i64 3>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test5(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 2, i64 undef, i64 undef, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test6(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 3, i64 undef>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test7(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 undef, i64 3>
+ ret <4 x i64> %shl
+}
+
+define <4 x i64> @test8(<4 x i64> %A) {
+ %shl = shl <4 x i64> %A, <i64 undef, i64 undef, i64 undef, i64 undef>
+ ret <4 x i64> %shl
+}
+
+
diff --git a/test/CodeGen/X86/shift-double.ll b/test/CodeGen/X86/shift-double.ll
index 8d2b290..fd4ba81 100644
--- a/test/CodeGen/X86/shift-double.ll
+++ b/test/CodeGen/X86/shift-double.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -x86-asm-syntax=intel | \
+; RUN: llc < %s -march=x86 -mcpu=generic -x86-asm-syntax=intel | \
; RUN: grep "sh[lr]d" | count 5
define i64 @test1(i64 %X, i8 %C) {
diff --git a/test/CodeGen/X86/shift-pcmp.ll b/test/CodeGen/X86/shift-pcmp.ll
new file mode 100644
index 0000000..365c731
--- /dev/null
+++ b/test/CodeGen/X86/shift-pcmp.ll
@@ -0,0 +1,30 @@
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=generic -march=x86-64 -mattr=+avx | FileCheck %s
+
+define <8 x i16> @foo(<8 x i16> %a, <8 x i16> %b) {
+; CHECK: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-NEXT: .short 32
+; CHECK-LABEL: {{^_?foo:}}
+; CHECK-NOT: psll
+entry:
+ %icmp = icmp eq <8 x i16> %a, %b
+ %zext = zext <8 x i1> %icmp to <8 x i16>
+ %shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ ret <8 x i16> %shl
+}
+
+; Don't fail with an assert due to an undef in the buildvector
+define <8 x i16> @bar(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: bar
+entry:
+ %icmp = icmp eq <8 x i16> %a, %b
+ %zext = zext <8 x i1> %icmp to <8 x i16>
+ %shl = shl nuw nsw <8 x i16> %zext, <i16 5, i16 undef, i16 5, i16 5, i16 5, i16 5, i16 5, i16 5>
+ ret <8 x i16> %shl
+}
diff --git a/test/CodeGen/X86/shl_undef.ll b/test/CodeGen/X86/shl_undef.ll
index 54b74cc..705af5b 100644
--- a/test/CodeGen/X86/shl_undef.ll
+++ b/test/CodeGen/X86/shl_undef.ll
@@ -1,15 +1,17 @@
-; RUN: llc < %s -O1 -mtriple=i386-apple-darwin | FileCheck %s
+; RUN: llc < %s -O1 -mtriple=i386-apple-darwin -x86-asm-syntax=intel | FileCheck %s
;
; Interesting test case where %tmp1220 = xor i32 %tmp862, %tmp592 and
; %tmp1676 = xor i32 %tmp1634, %tmp1530 have zero demanded bits after
; DAGCombiner optimization pass. These are changed to undef and in turn
; the successor shl(s) become shl undef, 1. This pattern then matches
-; shl x, 1 -> add x, x. add undef, undef doesn't guarentee the low
+; shl x, 1 -> add x, x. add undef, undef doesn't guarantee the low
; order bit is zero and is incorrect.
;
; See rdar://9453156 and rdar://9487392.
;
+; Use Intel syntax; with AT&T syntax the "shl" check could match "pushl".
+
; CHECK-NOT: shl
define i32 @foo(i8* %a0, i32* %a2) nounwind {
entry:
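Concretely: shl undef, 1 always produces a value whose low bit is zero, but in add undef, undef each use of undef may resolve independently (e.g. 1 + 2 = 3), so the sum's low bit is not provably zero. That is the miscompile this test pins down.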
diff --git a/test/CodeGen/X86/shrink-compare.ll b/test/CodeGen/X86/shrink-compare.ll
index bb89201..fc7ee06 100644
--- a/test/CodeGen/X86/shrink-compare.ll
+++ b/test/CodeGen/X86/shrink-compare.ll
@@ -2,7 +2,7 @@
declare void @bar()
-define void @test1(i32* nocapture %X) nounwind {
+define void @test1(i32* nocapture %X) nounwind minsize {
entry:
%tmp1 = load i32* %X, align 4
%and = and i32 %tmp1, 255
@@ -19,7 +19,7 @@ if.end:
; CHECK: cmpb $47, (%{{rdi|rcx}})
}
-define void @test2(i32 %X) nounwind {
+define void @test2(i32 %X) nounwind minsize {
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 47
@@ -35,7 +35,7 @@ if.end:
; CHECK: cmpb $47, %{{dil|cl}}
}
-define void @test3(i32 %X) nounwind {
+define void @test3(i32 %X) nounwind minsize {
entry:
%and = and i32 %X, 255
%cmp = icmp eq i32 %and, 255
@@ -70,7 +70,7 @@ lor.end: ; preds = %lor.rhs, %entry
@x = global { i8, i8, i8, i8, i8, i8, i8, i8 } { i8 1, i8 0, i8 0, i8 0, i8 1, i8 0, i8 0, i8 1 }, align 4
; PR16551
-define void @test5(i32 %X) nounwind {
+define void @test5(i32 %X) nounwind minsize {
entry:
%bf.load = load i56* bitcast ({ i8, i8, i8, i8, i8, i8, i8, i8 }* @x to i56*), align 4
%bf.lshr = lshr i56 %bf.load, 32
diff --git a/test/CodeGen/X86/sibcall-5.ll b/test/CodeGen/X86/sibcall-5.ll
index c479030..c04af23 100644
--- a/test/CodeGen/X86/sibcall-5.ll
+++ b/test/CodeGen/X86/sibcall-5.ll
@@ -8,7 +8,7 @@
define double @foo(double %a) nounwind readonly ssp {
entry:
; X32-LABEL: foo:
-; X32: jmp _sin$stub
+; X32: jmp L_sin$stub
; X64-LABEL: foo:
; X64: jmp _sin
@@ -18,7 +18,7 @@ entry:
define float @bar(float %a) nounwind readonly ssp {
; X32-LABEL: bar:
-; X32: jmp _sinf$stub
+; X32: jmp L_sinf$stub
; X64-LABEL: bar:
; X64: jmp _sinf
@@ -27,6 +27,11 @@ entry:
ret float %0
}
+; X32-LABEL: L_sin$stub:
+; X32-NEXT: .indirect_symbol _sin
+; X32-LABEL: L_sinf$stub:
+; X32-NEXT: .indirect_symbol _sinf
+
declare float @sinf(float) nounwind readonly
declare double @sin(double) nounwind readonly
diff --git a/test/CodeGen/X86/sibcall.ll b/test/CodeGen/X86/sibcall.ll
index 589e9ec..28fc626 100644
--- a/test/CodeGen/X86/sibcall.ll
+++ b/test/CodeGen/X86/sibcall.ll
@@ -247,11 +247,11 @@ entry:
define void @t15(%struct.foo* noalias sret %agg.result) nounwind {
; 32-LABEL: t15:
; 32: calll {{_?}}f
-; 32: ret $4
+; 32: retl $4
; 64-LABEL: t15:
; 64: callq {{_?}}f
-; 64: ret
+; 64: retq
tail call fastcc void @f(%struct.foo* noalias sret %agg.result) nounwind
ret void
}
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith-2.ll b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
new file mode 100644
index 0000000..600ee1b
--- /dev/null
+++ b/test/CodeGen/X86/sse-scalar-fp-arith-2.ll
@@ -0,0 +1,423 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+; Ensure that the backend selects SSE/AVX scalar fp instructions
+; from a packed fp instruction plus a vector insert.
+
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %a, %b
+ %2 = shufflevector <4 x float> %1, <4 x float> %a, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %a, %b
+ %2 = shufflevector <2 x double> %1, <2 x double> %a, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %b, %a
+ %2 = shufflevector <4 x float> %1, <4 x float> %b, <4 x i32> <i32 0, i32 5, i32 6, i32 7>
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %b, %a
+ %2 = shufflevector <2 x double> %1, <2 x double> %b, <2 x i32> <i32 0, i32 3>
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test3_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test3_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %a, %b
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test3_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test3_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test3_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %a, %b
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test3_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test4_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fadd <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fsub <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fmul <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test4_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = fdiv <4 x float> %b, %a
+ %2 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %b, <4 x float> %1
+ ret <4 x float> %2
+}
+
+; CHECK-LABEL: test4_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test4_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fadd <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fsub <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fmul <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test4_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = fdiv <2 x double> %b, %a
+ %2 = select <2 x i1> <i1 false, i1 true>, <2 x double> %b, <2 x double> %1
+ ret <2 x double> %2
+}
+
+; CHECK-LABEL: test4_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/sse-scalar-fp-arith.ll b/test/CodeGen/X86/sse-scalar-fp-arith.ll
new file mode 100644
index 0000000..3949a83
--- /dev/null
+++ b/test/CodeGen/X86/sse-scalar-fp-arith.ll
@@ -0,0 +1,310 @@
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mattr=-sse4.1 -mcpu=corei7 < %s | FileCheck -check-prefix=CHECK -check-prefix=SSE2 %s
+; RUN: llc -mtriple=x86_64-pc-linux -mcpu=corei7-avx < %s | FileCheck -check-prefix=CHECK -check-prefix=AVX %s
+
+; Ensure that the backend no longer emits unnecessary vector insert
+; instructions immediately after SSE scalar fp instructions
+; like addss or mulss.
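+;
+; For example, for test_add_ss below the old lowering produced roughly (a
+; sketch, not verbatim llc output):
+;
+;   addss %xmm1, %xmm0
+;   movss %xmm0, %xmm3          # redundant: the scalar result is already in place
+;
+; whereas now the result stays in %xmm0 with no trailing movss, which is what
+; the CHECK-NOT lines assert.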
+
+
+define <4 x float> @test_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %add = fadd float %2, %1
+ %3 = insertelement <4 x float> %a, float %add, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_add_ss
+; SSE2: addss %xmm1, %xmm0
+; AVX: vaddss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %sub = fsub float %2, %1
+ %3 = insertelement <4 x float> %a, float %sub, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_sub_ss
+; SSE2: subss %xmm1, %xmm0
+; AVX: vsubss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+define <4 x float> @test_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %mul = fmul float %2, %1
+ %3 = insertelement <4 x float> %a, float %mul, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_mul_ss
+; SSE2: mulss %xmm1, %xmm0
+; AVX: vmulss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %div = fdiv float %2, %1
+ %3 = insertelement <4 x float> %a, float %div, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_div_ss
+; SSE2: divss %xmm1, %xmm0
+; AVX: vdivss %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %add = fadd double %2, %1
+ %3 = insertelement <2 x double> %a, double %add, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_add_sd
+; SSE2: addsd %xmm1, %xmm0
+; AVX: vaddsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %sub = fsub double %2, %1
+ %3 = insertelement <2 x double> %a, double %sub, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_sub_sd
+; SSE2: subsd %xmm1, %xmm0
+; AVX: vsubsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %mul = fmul double %2, %1
+ %3 = insertelement <2 x double> %a, double %mul, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_mul_sd
+; SSE2: mulsd %xmm1, %xmm0
+; AVX: vmulsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %b, i32 0
+ %2 = extractelement <2 x double> %a, i32 0
+ %div = fdiv double %2, %1
+ %3 = insertelement <2 x double> %a, double %div, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test_div_sd
+; SSE2: divsd %xmm1, %xmm0
+; AVX: vdivsd %xmm1, %xmm0, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test2_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %add = fadd float %1, %2
+ %3 = insertelement <4 x float> %b, float %add, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_add_ss
+; SSE2: addss %xmm0, %xmm1
+; AVX: vaddss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %sub = fsub float %2, %1
+ %3 = insertelement <4 x float> %b, float %sub, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_sub_ss
+; SSE2: subss %xmm0, %xmm1
+; AVX: vsubss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %mul = fmul float %1, %2
+ %3 = insertelement <4 x float> %b, float %mul, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_mul_ss
+; SSE2: mulss %xmm0, %xmm1
+; AVX: vmulss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test2_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %a, i32 0
+ %2 = extractelement <4 x float> %b, i32 0
+ %div = fdiv float %2, %1
+ %3 = insertelement <4 x float> %b, float %div, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test2_div_ss
+; SSE2: divss %xmm0, %xmm1
+; AVX: vdivss %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <2 x double> @test2_add_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %add = fadd double %1, %2
+ %3 = insertelement <2 x double> %b, double %add, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_add_sd
+; SSE2: addsd %xmm0, %xmm1
+; AVX: vaddsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_sub_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %sub = fsub double %2, %1
+ %3 = insertelement <2 x double> %b, double %sub, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_sub_sd
+; SSE2: subsd %xmm0, %xmm1
+; AVX: vsubsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_mul_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %mul = fmul double %1, %2
+ %3 = insertelement <2 x double> %b, double %mul, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_mul_sd
+; SSE2: mulsd %xmm0, %xmm1
+; AVX: vmulsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <2 x double> @test2_div_sd(<2 x double> %a, <2 x double> %b) {
+ %1 = extractelement <2 x double> %a, i32 0
+ %2 = extractelement <2 x double> %b, i32 0
+ %div = fdiv double %2, %1
+ %3 = insertelement <2 x double> %b, double %div, i32 0
+ ret <2 x double> %3
+}
+
+; CHECK-LABEL: test2_div_sd
+; SSE2: divsd %xmm0, %xmm1
+; AVX: vdivsd %xmm0, %xmm1, %xmm0
+; CHECK-NOT: movsd
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_add_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %add = fadd float %2, %1
+ %add2 = fadd float %2, %add
+ %3 = insertelement <4 x float> %a, float %add2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_add_ss
+; CHECK: addss
+; CHECK: addss
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_sub_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %sub = fsub float %2, %1
+ %sub2 = fsub float %2, %sub
+ %3 = insertelement <4 x float> %a, float %sub2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_sub_ss
+; CHECK: subss
+; CHECK: subss
+; CHECK-NOT: movss
+; CHECK: ret
+
+
+define <4 x float> @test_multiple_mul_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %mul = fmul float %2, %1
+ %mul2 = fmul float %2, %mul
+ %3 = insertelement <4 x float> %a, float %mul2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_mul_ss
+; CHECK: mulss
+; CHECK: mulss
+; CHECK-NOT: movss
+; CHECK: ret
+
+define <4 x float> @test_multiple_div_ss(<4 x float> %a, <4 x float> %b) {
+ %1 = extractelement <4 x float> %b, i32 0
+ %2 = extractelement <4 x float> %a, i32 0
+ %div = fdiv float %2, %1
+ %div2 = fdiv float %2, %div
+ %3 = insertelement <4 x float> %a, float %div2, i32 0
+ ret <4 x float> %3
+}
+
+; CHECK-LABEL: test_multiple_div_ss
+; CHECK: divss
+; CHECK: divss
+; CHECK-NOT: movss
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/sse1.ll b/test/CodeGen/X86/sse1.ll
index 47c6429..183297e 100644
--- a/test/CodeGen/X86/sse1.ll
+++ b/test/CodeGen/X86/sse1.ll
@@ -43,3 +43,17 @@ entry:
; CHECK-NOT: shufps $16
; CHECK: ret
}
+
+; We used to get stuck in type legalization for this example when lowering the
+; vselect. With SSE1, v4f32 is a legal type but v4i1 (or any vector integer type)
+; is not. We used to ping-pong between splitting the vselect for the v4i1
+; condition operand and widening the resulting vselect for the v4f32 result.
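+; As a sketch of that loop (hypothetical legalization steps, not an actual
+; -debug trace):
+;   split:  <4 x i1> cond    -> two <2 x i1> / <2 x float> vselects
+;   widen:  <4 x float> res  -> one <4 x i1> / <4 x float> vselect again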
+; PR18036
+
+; CHECK-LABEL: vselect
+define <4 x float> @vselect(<4 x float>*%p, <4 x i32> %q) {
+entry:
+ %a1 = icmp eq <4 x i32> %q, zeroinitializer
+ %a14 = select <4 x i1> %a1, <4 x float> <float 1.000000e+00, float 2.000000e+00, float 3.000000e+00, float 4.000000e+00>, <4 x float> zeroinitializer
+ ret <4 x float> %a14
+}
diff --git a/test/CodeGen/X86/sse2-blend.ll b/test/CodeGen/X86/sse2-blend.ll
index 1ac9832..c63ff72 100644
--- a/test/CodeGen/X86/sse2-blend.ll
+++ b/test/CodeGen/X86/sse2-blend.ll
@@ -1,22 +1,22 @@
; RUN: llc < %s -march=x86 -mcpu=yonah -mattr=+sse2,-sse4.1 | FileCheck %s
-; CHECK: vsel_float
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK-LABEL: vsel_float
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
; CHECK: ret
define void@vsel_float(<4 x float>* %v1, <4 x float>* %v2) {
%A = load <4 x float>* %v1
%B = load <4 x float>* %v2
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
+ %vsel = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
store <4 x float > %vsel, <4 x float>* %v1
ret void
}
-; CHECK: vsel_i32
-; CHECK: pandn
-; CHECK: pand
-; CHECK: por
+; CHECK-LABEL: vsel_i32
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK-NOT: orps
; CHECK: ret
define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
%A = load <4 x i32>* %v1
@@ -27,7 +27,7 @@ define void@vsel_i32(<4 x i32>* %v1, <4 x i32>* %v2) {
}
; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK: vsel_i64
+; CHECK-LABEL: vsel_i64
; CHECK: andnps
; CHECK: orps
; CHECK: ret
@@ -41,7 +41,7 @@ define void@vsel_i64(<2 x i64>* %v1, <2 x i64>* %v2) {
}
; Without forcing instructions, fall back to the preferred PS domain.
-; CHECK: vsel_double
+; CHECK-LABEL: vsel_double
; CHECK: andnps
; CHECK: orps
; CHECK: ret
diff --git a/test/CodeGen/X86/sse2-intrinsics-x86.ll b/test/CodeGen/X86/sse2-intrinsics-x86.ll
index ff6c10b..cfc892d 100644
--- a/test/CodeGen/X86/sse2-intrinsics-x86.ll
+++ b/test/CodeGen/X86/sse2-intrinsics-x86.ll
@@ -710,3 +710,10 @@ define i32 @test_x86_sse2_ucomineq_sd(<2 x double> %a0, <2 x double> %a1) {
ret i32 %res
}
declare i32 @llvm.x86.sse2.ucomineq.sd(<2 x double>, <2 x double>) nounwind readnone
+
+define void @test_x86_sse2_pause() {
+ ; CHECK: pause
+ tail call void @llvm.x86.sse2.pause()
+ ret void
+}
+declare void @llvm.x86.sse2.pause() nounwind
diff --git a/test/CodeGen/X86/sse2-vector-shifts.ll b/test/CodeGen/X86/sse2-vector-shifts.ll
index 462def9..7c8d5e5 100644
--- a/test/CodeGen/X86/sse2-vector-shifts.ll
+++ b/test/CodeGen/X86/sse2-vector-shifts.ll
@@ -9,8 +9,8 @@ entry:
}
; CHECK-LABEL: test_sllw_1:
-; CHECK: psllw $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psllw $0, %xmm0
+; CHECK: ret
define <8 x i16> @test_sllw_2(<8 x i16> %InVec) {
entry:
@@ -24,12 +24,12 @@ entry:
define <8 x i16> @test_sllw_3(<8 x i16> %InVec) {
entry:
- %shl = shl <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = shl <8 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <8 x i16> %shl
}
; CHECK-LABEL: test_sllw_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psllw $15, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_slld_1(<4 x i32> %InVec) {
@@ -39,8 +39,8 @@ entry:
}
; CHECK-LABEL: test_slld_1:
-; CHECK: pslld $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: pslld $0, %xmm0
+; CHECK: ret
define <4 x i32> @test_slld_2(<4 x i32> %InVec) {
entry:
@@ -54,12 +54,12 @@ entry:
define <4 x i32> @test_slld_3(<4 x i32> %InVec) {
entry:
- %shl = shl <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ %shl = shl <4 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31>
ret <4 x i32> %shl
}
; CHECK-LABEL: test_slld_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: pslld $31, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_sllq_1(<2 x i64> %InVec) {
@@ -69,8 +69,8 @@ entry:
}
; CHECK-LABEL: test_sllq_1:
-; CHECK: psllq $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psllq $0, %xmm0
+; CHECK: ret
define <2 x i64> @test_sllq_2(<2 x i64> %InVec) {
entry:
@@ -84,12 +84,12 @@ entry:
define <2 x i64> @test_sllq_3(<2 x i64> %InVec) {
entry:
- %shl = shl <2 x i64> %InVec, <i64 64, i64 64>
+ %shl = shl <2 x i64> %InVec, <i64 63, i64 63>
ret <2 x i64> %shl
}
; CHECK-LABEL: test_sllq_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psllq $63, %xmm0
; CHECK-NEXT: ret
; SSE2 Arithmetic Shift
@@ -101,8 +101,8 @@ entry:
}
; CHECK-LABEL: test_sraw_1:
-; CHECK: psraw $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psraw $0, %xmm0
+; CHECK: ret
define <8 x i16> @test_sraw_2(<8 x i16> %InVec) {
entry:
@@ -116,7 +116,7 @@ entry:
define <8 x i16> @test_sraw_3(<8 x i16> %InVec) {
entry:
- %shl = ashr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = ashr <8 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <8 x i16> %shl
}
@@ -131,8 +131,8 @@ entry:
}
; CHECK-LABEL: test_srad_1:
-; CHECK: psrad $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrad $0, %xmm0
+; CHECK: ret
define <4 x i32> @test_srad_2(<4 x i32> %InVec) {
entry:
@@ -146,7 +146,7 @@ entry:
define <4 x i32> @test_srad_3(<4 x i32> %InVec) {
entry:
- %shl = ashr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ %shl = ashr <4 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31>
ret <4 x i32> %shl
}
@@ -163,8 +163,8 @@ entry:
}
; CHECK-LABEL: test_srlw_1:
-; CHECK: psrlw $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrlw $0, %xmm0
+; CHECK: ret
define <8 x i16> @test_srlw_2(<8 x i16> %InVec) {
entry:
@@ -178,12 +178,12 @@ entry:
define <8 x i16> @test_srlw_3(<8 x i16> %InVec) {
entry:
- %shl = lshr <8 x i16> %InVec, <i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16, i16 16>
+ %shl = lshr <8 x i16> %InVec, <i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15, i16 15>
ret <8 x i16> %shl
}
; CHECK-LABEL: test_srlw_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psrlw $15, %xmm0
; CHECK-NEXT: ret
define <4 x i32> @test_srld_1(<4 x i32> %InVec) {
@@ -193,8 +193,8 @@ entry:
}
; CHECK-LABEL: test_srld_1:
-; CHECK: psrld $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrld $0, %xmm0
+; CHECK: ret
define <4 x i32> @test_srld_2(<4 x i32> %InVec) {
entry:
@@ -208,12 +208,12 @@ entry:
define <4 x i32> @test_srld_3(<4 x i32> %InVec) {
entry:
- %shl = lshr <4 x i32> %InVec, <i32 32, i32 32, i32 32, i32 32>
+ %shl = lshr <4 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31>
ret <4 x i32> %shl
}
; CHECK-LABEL: test_srld_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psrld $31, %xmm0
; CHECK-NEXT: ret
define <2 x i64> @test_srlq_1(<2 x i64> %InVec) {
@@ -223,8 +223,8 @@ entry:
}
; CHECK-LABEL: test_srlq_1:
-; CHECK: psrlq $0, %xmm0
-; CHECK-NEXT: ret
+; CHECK-NOT: psrlq $0, %xmm0
+; CHECK: ret
define <2 x i64> @test_srlq_2(<2 x i64> %InVec) {
entry:
@@ -238,10 +238,130 @@ entry:
define <2 x i64> @test_srlq_3(<2 x i64> %InVec) {
entry:
- %shl = lshr <2 x i64> %InVec, <i64 64, i64 64>
+ %shl = lshr <2 x i64> %InVec, <i64 63, i64 63>
ret <2 x i64> %shl
}
; CHECK-LABEL: test_srlq_3:
-; CHECK: xorps %xmm0, %xmm0
+; CHECK: psrlq $63, %xmm0
+; CHECK-NEXT: ret
+
+
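+; The following tests check that chains of constant vector shifts are folded:
+; e.g. in sra_sra_v4i32 an ashr by 2 followed by an ashr by 4 becomes a single
+; psrad $6, and a shl/lshr pair by the same amount becomes a single and mask.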
+; CHECK-LABEL: sra_sra_v4i32:
+; CHECK: psrad $6, %xmm0
+; CHECK-NEXT: retq
+define <4 x i32> @sra_sra_v4i32(<4 x i32> %x) nounwind {
+ %sra0 = ashr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %sra1 = ashr <4 x i32> %sra0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %sra1
+}
+
+; CHECK-LABEL: @srl_srl_v4i32
+; CHECK: psrld $6, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @srl_srl_v4i32(<4 x i32> %x) nounwind {
+ %srl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %srl1 = lshr <4 x i32> %srl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %srl1
+}
+
+; CHECK-LABEL: @srl_shl_v4i32
+; CHECK: andps
+; CHECK-NEXT: retq
+define <4 x i32> @srl_shl_v4i32(<4 x i32> %x) nounwind {
+ %srl0 = shl <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
+ %srl1 = lshr <4 x i32> %srl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %srl1
+}
+
+; CHECK-LABEL: @srl_sra_31_v4i32
+; CHECK: psrld $31, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @srl_sra_31_v4i32(<4 x i32> %x, <4 x i32> %y) nounwind {
+ %sra = ashr <4 x i32> %x, %y
+ %srl1 = lshr <4 x i32> %sra, <i32 31, i32 31, i32 31, i32 31>
+ ret <4 x i32> %srl1
+}
+
+; CHECK-LABEL: @shl_shl_v4i32
+; CHECK: pslld $6, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @shl_shl_v4i32(<4 x i32> %x) nounwind {
+ %shl0 = shl <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %shl1 = shl <4 x i32> %shl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @shl_sra_v4i32
+; CHECK: andps
+; CHECK-NEXT: ret
+define <4 x i32> @shl_sra_v4i32(<4 x i32> %x) nounwind {
+ %shl0 = ashr <4 x i32> %x, <i32 4, i32 4, i32 4, i32 4>
+ %shl1 = shl <4 x i32> %shl0, <i32 4, i32 4, i32 4, i32 4>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @shl_srl_v4i32
+; CHECK: pslld $3, %xmm0
+; CHECK-NEXT: pand
+; CHECK-NEXT: ret
+define <4 x i32> @shl_srl_v4i32(<4 x i32> %x) nounwind {
+ %shl0 = lshr <4 x i32> %x, <i32 2, i32 2, i32 2, i32 2>
+ %shl1 = shl <4 x i32> %shl0, <i32 5, i32 5, i32 5, i32 5>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @shl_zext_srl_v4i32
+; CHECK: andps
; CHECK-NEXT: ret
+define <4 x i32> @shl_zext_srl_v4i32(<4 x i16> %x) nounwind {
+ %srl = lshr <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
+ %zext = zext <4 x i16> %srl to <4 x i32>
+ %shl = shl <4 x i32> %zext, <i32 2, i32 2, i32 2, i32 2>
+ ret <4 x i32> %shl
+}
+
+; CHECK-LABEL: @sra_trunc_srl_v4i32
+; CHECK: psrad $19, %xmm0
+; CHECK-NEXT: retq
+define <4 x i16> @sra_trunc_srl_v4i32(<4 x i32> %x) nounwind {
+ %srl = lshr <4 x i32> %x, <i32 16, i32 16, i32 16, i32 16>
+ %trunc = trunc <4 x i32> %srl to <4 x i16>
+ %sra = ashr <4 x i16> %trunc, <i16 3, i16 3, i16 3, i16 3>
+ ret <4 x i16> %sra
+}
+
+; CHECK-LABEL: @shl_zext_shl_v4i32
+; CHECK: pand
+; CHECK-NEXT: pslld $19, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @shl_zext_shl_v4i32(<4 x i16> %x) nounwind {
+ %shl0 = shl <4 x i16> %x, <i16 2, i16 2, i16 2, i16 2>
+ %ext = zext <4 x i16> %shl0 to <4 x i32>
+ %shl1 = shl <4 x i32> %ext, <i32 17, i32 17, i32 17, i32 17>
+ ret <4 x i32> %shl1
+}
+
+; CHECK-LABEL: @sra_v4i32
+; CHECK: psrad $3, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @sra_v4i32(<4 x i32> %x) nounwind {
+ %sra = ashr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %sra
+}
+
+; CHECK-LABEL: @srl_v4i32
+; CHECK: psrld $3, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @srl_v4i32(<4 x i32> %x) nounwind {
+ %sra = lshr <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %sra
+}
+
+; CHECK-LABEL: @shl_v4i32
+; CHECK: pslld $3, %xmm0
+; CHECK-NEXT: ret
+define <4 x i32> @shl_v4i32(<4 x i32> %x) nounwind {
+ %sra = shl <4 x i32> %x, <i32 3, i32 3, i32 3, i32 3>
+ ret <4 x i32> %sra
+}
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index 9147c22..628dba0 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -9,10 +9,10 @@ define void @test1(<2 x double>* %r, <2 x double>* %A, double %B) nounwind {
ret void
; CHECK-LABEL: test1:
-; CHECK: movl 8(%esp), %eax
-; CHECK-NEXT: movapd (%eax), %xmm0
+; CHECK: movl 4(%esp), %eax
+; CHECK-NEXT: movl 8(%esp), %ecx
+; CHECK-NEXT: movapd (%ecx), %xmm0
; CHECK-NEXT: movlpd 12(%esp), %xmm0
-; CHECK-NEXT: movl 4(%esp), %eax
; CHECK-NEXT: movapd %xmm0, (%eax)
; CHECK-NEXT: ret
}
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index a32f5de..4681fde 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -4,7 +4,7 @@
;CHECK: blendvps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %v1, <4 x float> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
ret <4 x float> %vsel
}
@@ -13,7 +13,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK: blendvps
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i8> %v1, <4 x i8> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
ret <4 x i8> %vsel
}
@@ -21,7 +21,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
;CHECK: blendvps
;CHECK: ret
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i16> %v1, <4 x i16> %v2
+ %vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
ret <4 x i16> %vsel
}
@@ -30,13 +30,13 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
;CHECK: blendvps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
- %vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %v1, <4 x i32> %v2
+ %vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
ret <4 x i32> %vsel
}
;CHECK-LABEL: vsel_double:
-;CHECK: blendvpd
+;CHECK: movsd
;CHECK: ret
define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -45,7 +45,7 @@ define <4 x double> @vsel_double(<4 x double> %v1, <4 x double> %v2) {
;CHECK-LABEL: vsel_i64:
-;CHECK: blendvpd
+;CHECK: movsd
;CHECK: ret
define <4 x i64> @vsel_i64(<4 x i64> %v1, <4 x i64> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i64> %v1, <4 x i64> %v2
diff --git a/test/CodeGen/X86/ssp-data-layout.ll b/test/CodeGen/X86/ssp-data-layout.ll
new file mode 100644
index 0000000..e76ad7b
--- /dev/null
+++ b/test/CodeGen/X86/ssp-data-layout.ll
@@ -0,0 +1,510 @@
+; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-pc-linux-gnu -mcpu=corei7 -o - | FileCheck %s
+; This test is fairly fragile. The goal is to ensure that "large" stack
+; objects are allocated closest to the stack protector (i.e., farthest away
+; from the stack pointer). In standard SSP mode this means that large (>=
+; ssp-buffer-size) arrays and structures containing such arrays are
+; closest to the protector. With sspstrong and sspreq this means large
+; arrays/structures-with-arrays are closest, followed by small (< ssp-buffer-size)
+; arrays/structures-with-arrays, and then addr-taken variables.
+;
+; Ideally, we only want to verify that the objects appear in the correct groups
+; and that the groups have the correct relative stack offsets. The ordering
+; within a group is not relevant to this test. Unfortunately, there is no
+; elegant way to do this, so just match the offset for each object.
+; RUN: llc < %s -disable-fp-elim -mtriple=x86_64-unknown-unknown -O0 -mcpu=corei7 -o - \
+; RUN: | FileCheck --check-prefix=FAST-NON-LIN %s
+; FastISel was not setting the StackProtectorIndex when lowering
+; Intrinsic::stackprotector, and as a result the stack re-arrangement code was
+; never applied. This problem only shows up on non-Linux platforms because on
+; Linux the stack protector cookie is loaded from a special address space,
+; which always triggers standard ISel. Run a basic test to ensure that at -O0
+; on a non-Linux target the data layout rules are triggered.
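+;
+; Note: the default ssp-buffer-size is 8, so the [8 x i8] and [8 x i32]
+; allocas below (and the structs wrapping them) fall into the "large" groups,
+; while the [2 x i8] and [2 x i16] allocas count as "small".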
+
+%struct.struct_large_char = type { [8 x i8] }
+%struct.struct_small_char = type { [2 x i8] }
+%struct.struct_large_nonchar = type { [8 x i32] }
+%struct.struct_small_nonchar = type { [2 x i16] }
+
+define void @layout_ssp() ssp {
+entry:
+; Expected stack layout for ssp is
+; -16 large_char . Group 1, nested arrays, arrays >= ssp-buffer-size
+; -24 struct_large_char .
+; -28 scalar1 | Everything else
+; -32 scalar2
+; -36 scalar3
+; -40 addr-of
+; -44 small_nonchar
+; -80 large_nonchar
+; -82 small_char
+; -88 struct_small_char
+; -120 struct_large_nonchar
+; -128 struct_small_nonchar
+
+; CHECK: layout_ssp:
+; CHECK: call{{l|q}} get_scalar1
+; CHECK: movl %eax, -28(
+; CHECK: call{{l|q}} end_scalar1
+
+; CHECK: call{{l|q}} get_scalar2
+; CHECK: movl %eax, -32(
+; CHECK: call{{l|q}} end_scalar2
+
+; CHECK: call{{l|q}} get_scalar3
+; CHECK: movl %eax, -36(
+; CHECK: call{{l|q}} end_scalar3
+
+; CHECK: call{{l|q}} get_addrof
+; CHECK: movl %eax, -40(
+; CHECK: call{{l|q}} end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: movw %ax, -44(
+; CHECK: call{{l|q}} end_small_nonchar
+
+; CHECK: call{{l|q}} get_large_nonchar
+; CHECK: movl %eax, -80(
+; CHECK: call{{l|q}} end_large_nonchar
+
+; CHECK: call{{l|q}} get_small_char
+; CHECK: movb %al, -82(
+; CHECK: call{{l|q}} end_small_char
+
+; CHECK: call{{l|q}} get_large_char
+; CHECK: movb %al, -16(
+; CHECK: call{{l|q}} end_large_char
+
+; CHECK: call{{l|q}} get_struct_large_char
+; CHECK: movb %al, -24(
+; CHECK: call{{l|q}} end_struct_large_char
+
+; CHECK: call{{l|q}} get_struct_small_char
+; CHECK: movb %al, -88(
+; CHECK: call{{l|q}} end_struct_small_char
+
+; CHECK: call{{l|q}} get_struct_large_nonchar
+; CHECK: movl %eax, -120(
+; CHECK: call{{l|q}} end_struct_large_nonchar
+
+; CHECK: call{{l|q}} get_struct_small_nonchar
+; CHECK: movw %ax, -128(
+; CHECK: call{{l|q}} end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @layout_sspstrong() nounwind uwtable sspstrong {
+entry:
+; Expected stack layout for sspstrong is
+; -48 large_nonchar . Group 1, nested arrays,
+; -56 large_char . arrays >= ssp-buffer-size
+; -64 struct_large_char .
+; -96 struct_large_nonchar .
+; -100 small_non_char | Group 2, nested arrays,
+; -102 small_char | arrays < ssp-buffer-size
+; -104 struct_small_char |
+; -112 struct_small_nonchar |
+; -116 addrof * Group 3, addr-of local
+; -120 scalar + Group 4, everything else
+; -124 scalar +
+; -128 scalar +
+;
+; CHECK: layout_sspstrong:
+; CHECK: call{{l|q}} get_scalar1
+; CHECK: movl %eax, -120(
+; CHECK: call{{l|q}} end_scalar1
+
+; CHECK: call{{l|q}} get_scalar2
+; CHECK: movl %eax, -124(
+; CHECK: call{{l|q}} end_scalar2
+
+; CHECK: call{{l|q}} get_scalar3
+; CHECK: movl %eax, -128(
+; CHECK: call{{l|q}} end_scalar3
+
+; CHECK: call{{l|q}} get_addrof
+; CHECK: movl %eax, -116(
+; CHECK: call{{l|q}} end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: movw %ax, -100(
+; CHECK: call{{l|q}} end_small_nonchar
+
+; CHECK: call{{l|q}} get_large_nonchar
+; CHECK: movl %eax, -48(
+; CHECK: call{{l|q}} end_large_nonchar
+
+; CHECK: call{{l|q}} get_small_char
+; CHECK: movb %al, -102(
+; CHECK: call{{l|q}} end_small_char
+
+; CHECK: call{{l|q}} get_large_char
+; CHECK: movb %al, -56(
+; CHECK: call{{l|q}} end_large_char
+
+; CHECK: call{{l|q}} get_struct_large_char
+; CHECK: movb %al, -64(
+; CHECK: call{{l|q}} end_struct_large_char
+
+; CHECK: call{{l|q}} get_struct_small_char
+; CHECK: movb %al, -104(
+; CHECK: call{{l|q}} end_struct_small_char
+
+; CHECK: call{{l|q}} get_struct_large_nonchar
+; CHECK: movl %eax, -96(
+; CHECK: call{{l|q}} end_struct_large_nonchar
+
+; CHECK: call{{l|q}} get_struct_small_nonchar
+; CHECK: movw %ax, -112(
+; CHECK: call{{l|q}} end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @layout_sspreq() nounwind uwtable sspreq {
+entry:
+; Expected stack layout for sspreq is the same as sspstrong
+;
+; CHECK: layout_sspreq:
+; CHECK: call{{l|q}} get_scalar1
+; CHECK: movl %eax, -120(
+; CHECK: call{{l|q}} end_scalar1
+
+; CHECK: call{{l|q}} get_scalar2
+; CHECK: movl %eax, -124(
+; CHECK: call{{l|q}} end_scalar2
+
+; CHECK: call{{l|q}} get_scalar3
+; CHECK: movl %eax, -128(
+; CHECK: call{{l|q}} end_scalar3
+
+; CHECK: call{{l|q}} get_addrof
+; CHECK: movl %eax, -116(
+; CHECK: call{{l|q}} end_addrof
+
+; CHECK: get_small_nonchar
+; CHECK: movw %ax, -100(
+; CHECK: call{{l|q}} end_small_nonchar
+
+; CHECK: call{{l|q}} get_large_nonchar
+; CHECK: movl %eax, -48(
+; CHECK: call{{l|q}} end_large_nonchar
+
+; CHECK: call{{l|q}} get_small_char
+; CHECK: movb %al, -102(
+; CHECK: call{{l|q}} end_small_char
+
+; CHECK: call{{l|q}} get_large_char
+; CHECK: movb %al, -56(
+; CHECK: call{{l|q}} end_large_char
+
+; CHECK: call{{l|q}} get_struct_large_char
+; CHECK: movb %al, -64(
+; CHECK: call{{l|q}} end_struct_large_char
+
+; CHECK: call{{l|q}} get_struct_small_char
+; CHECK: movb %al, -104(
+; CHECK: call{{l|q}} end_struct_small_char
+
+; CHECK: call{{l|q}} get_struct_large_nonchar
+; CHECK: movl %eax, -96(
+; CHECK: call{{l|q}} end_struct_large_nonchar
+
+; CHECK: call{{l|q}} get_struct_small_nonchar
+; CHECK: movw %ax, -112(
+; CHECK: call{{l|q}} end_struct_small_nonchar
+ %x = alloca i32, align 4
+ %y = alloca i32, align 4
+ %z = alloca i32, align 4
+ %ptr = alloca i32, align 4
+ %small2 = alloca [2 x i16], align 2
+ %large2 = alloca [8 x i32], align 16
+ %small = alloca [2 x i8], align 1
+ %large = alloca [8 x i8], align 1
+ %a = alloca %struct.struct_large_char, align 1
+ %b = alloca %struct.struct_small_char, align 1
+ %c = alloca %struct.struct_large_nonchar, align 8
+ %d = alloca %struct.struct_small_nonchar, align 2
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call i32 @get_scalar2()
+ store i32 %call1, i32* %y, align 4
+ call void @end_scalar2()
+ %call2 = call i32 @get_scalar3()
+ store i32 %call2, i32* %z, align 4
+ call void @end_scalar3()
+ %call3 = call i32 @get_addrof()
+ store i32 %call3, i32* %ptr, align 4
+ call void @end_addrof()
+ %call4 = call signext i16 @get_small_nonchar()
+ %arrayidx = getelementptr inbounds [2 x i16]* %small2, i32 0, i64 0
+ store i16 %call4, i16* %arrayidx, align 2
+ call void @end_small_nonchar()
+ %call5 = call i32 @get_large_nonchar()
+ %arrayidx6 = getelementptr inbounds [8 x i32]* %large2, i32 0, i64 0
+ store i32 %call5, i32* %arrayidx6, align 4
+ call void @end_large_nonchar()
+ %call7 = call signext i8 @get_small_char()
+ %arrayidx8 = getelementptr inbounds [2 x i8]* %small, i32 0, i64 0
+ store i8 %call7, i8* %arrayidx8, align 1
+ call void @end_small_char()
+ %call9 = call signext i8 @get_large_char()
+ %arrayidx10 = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call9, i8* %arrayidx10, align 1
+ call void @end_large_char()
+ %call11 = call signext i8 @get_struct_large_char()
+ %foo = getelementptr inbounds %struct.struct_large_char* %a, i32 0, i32 0
+ %arrayidx12 = getelementptr inbounds [8 x i8]* %foo, i32 0, i64 0
+ store i8 %call11, i8* %arrayidx12, align 1
+ call void @end_struct_large_char()
+ %call13 = call signext i8 @get_struct_small_char()
+ %foo14 = getelementptr inbounds %struct.struct_small_char* %b, i32 0, i32 0
+ %arrayidx15 = getelementptr inbounds [2 x i8]* %foo14, i32 0, i64 0
+ store i8 %call13, i8* %arrayidx15, align 1
+ call void @end_struct_small_char()
+ %call16 = call i32 @get_struct_large_nonchar()
+ %foo17 = getelementptr inbounds %struct.struct_large_nonchar* %c, i32 0, i32 0
+ %arrayidx18 = getelementptr inbounds [8 x i32]* %foo17, i32 0, i64 0
+ store i32 %call16, i32* %arrayidx18, align 4
+ call void @end_struct_large_nonchar()
+ %call19 = call signext i16 @get_struct_small_nonchar()
+ %foo20 = getelementptr inbounds %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %arrayidx21 = getelementptr inbounds [2 x i16]* %foo20, i32 0, i64 0
+ store i16 %call19, i16* %arrayidx21, align 2
+ call void @end_struct_small_nonchar()
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ %arraydecay22 = getelementptr inbounds [2 x i8]* %small, i32 0, i32 0
+ %arraydecay23 = getelementptr inbounds [8 x i32]* %large2, i32 0, i32 0
+ %arraydecay24 = getelementptr inbounds [2 x i16]* %small2, i32 0, i32 0
+ %0 = load i32* %x, align 4
+ %1 = load i32* %y, align 4
+ %2 = load i32* %z, align 4
+ %coerce.dive = getelementptr %struct.struct_large_char* %a, i32 0, i32 0
+ %3 = bitcast [8 x i8]* %coerce.dive to i64*
+ %4 = load i64* %3, align 1
+ %coerce.dive25 = getelementptr %struct.struct_small_char* %b, i32 0, i32 0
+ %5 = bitcast [2 x i8]* %coerce.dive25 to i16*
+ %6 = load i16* %5, align 1
+ %coerce.dive26 = getelementptr %struct.struct_small_nonchar* %d, i32 0, i32 0
+ %7 = bitcast [2 x i16]* %coerce.dive26 to i32*
+ %8 = load i32* %7, align 1
+ call void @takes_all(i64 %4, i16 %6, %struct.struct_large_nonchar* byval align 8 %c, i32 %8, i8* %arraydecay, i8* %arraydecay22, i32* %arraydecay23, i16* %arraydecay24, i32* %ptr, i32 %0, i32 %1, i32 %2)
+ ret void
+}
+
+define void @fast_non_linux() ssp {
+entry:
+; FAST-NON-LIN: fast_non_linux:
+; FAST-NON-LIN: call{{l|q}} get_scalar1
+; FAST-NON-LIN: movl %eax, -20(
+; FAST-NON-LIN: call{{l|q}} end_scalar1
+
+; FAST-NON-LIN: call{{l|q}} get_large_char
+; FAST-NON-LIN: movb %al, -16(
+; FAST-NON-LIN: call{{l|q}} end_large_char
+ %x = alloca i32, align 4
+ %large = alloca [8 x i8], align 1
+ %call = call i32 @get_scalar1()
+ store i32 %call, i32* %x, align 4
+ call void @end_scalar1()
+ %call1 = call signext i8 @get_large_char()
+ %arrayidx = getelementptr inbounds [8 x i8]* %large, i32 0, i64 0
+ store i8 %call1, i8* %arrayidx, align 1
+ call void @end_large_char()
+ %0 = load i32* %x, align 4
+ %arraydecay = getelementptr inbounds [8 x i8]* %large, i32 0, i32 0
+ call void @takes_two(i32 %0, i8* %arraydecay)
+ ret void
+}
+
+declare i32 @get_scalar1()
+declare void @end_scalar1()
+
+declare i32 @get_scalar2()
+declare void @end_scalar2()
+
+declare i32 @get_scalar3()
+declare void @end_scalar3()
+
+declare i32 @get_addrof()
+declare void @end_addrof()
+
+declare signext i16 @get_small_nonchar()
+declare void @end_small_nonchar()
+
+declare i32 @get_large_nonchar()
+declare void @end_large_nonchar()
+
+declare signext i8 @get_small_char()
+declare void @end_small_char()
+
+declare signext i8 @get_large_char()
+declare void @end_large_char()
+
+declare signext i8 @get_struct_large_char()
+declare void @end_struct_large_char()
+
+declare signext i8 @get_struct_small_char()
+declare void @end_struct_small_char()
+
+declare i32 @get_struct_large_nonchar()
+declare void @end_struct_large_nonchar()
+
+declare signext i16 @get_struct_small_nonchar()
+declare void @end_struct_small_nonchar()
+
+declare void @takes_all(i64, i16, %struct.struct_large_nonchar* byval align 8, i32, i8*, i8*, i32*, i16*, i32*, i32, i32, i32)
+declare void @takes_two(i32, i8*)
diff --git a/test/CodeGen/X86/stack-align-memcpy.ll b/test/CodeGen/X86/stack-align-memcpy.ll
index 87bb85f..0cc3aa8 100644
--- a/test/CodeGen/X86/stack-align-memcpy.ll
+++ b/test/CodeGen/X86/stack-align-memcpy.ll
@@ -2,6 +2,9 @@
%struct.foo = type { [88 x i8] }
+declare void @bar(i8* nocapture, %struct.foo* align 4 byval) nounwind
+declare void @baz(i8*) nounwind
+
; PR15249
; We can't use rep;movsl here because it clobbers the base pointer in %esi.
define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind {
@@ -15,4 +18,26 @@ define void @test1(%struct.foo* nocapture %x, i32 %y) nounwind {
; CHECK-NOT: rep;movsl
}
-declare void @bar(i8* nocapture, %struct.foo* align 4 byval) nounwind
+; PR19012
+; Also don't clobber %esi if the dynamic alloca comes after the memcpy.
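+; (The dynamic alloca makes the frame size unknown at compile time, so %esi is
+; reserved as the base pointer for local references; rep;movsl would clobber
+; its source operand %esi.)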
+define void @test2(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+ call void @bar(i8* %z, %struct.foo* align 4 byval %x)
+ %dynalloc = alloca i8, i32 %y, align 1
+ call void @baz(i8* %dynalloc)
+ ret void
+
+; CHECK-LABEL: test2:
+; CHECK: movl %esp, %esi
+; CHECK-NOT: rep;movsl
+}
+
+; Check that we do use rep movs if we make the alloca static.
+define void @test3(%struct.foo* nocapture %x, i32 %y, i8* %z) nounwind {
+ call void @bar(i8* %z, %struct.foo* align 4 byval %x)
+ %statalloc = alloca i8, i32 8, align 1
+ call void @baz(i8* %statalloc)
+ ret void
+
+; CHECK-LABEL: test3:
+; CHECK: rep;movsl
+}
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index bd27ac3..fb7e2db 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -30,7 +30,7 @@ attributes #0 = { sspreq }
!2 = metadata !{metadata !3}
!3 = metadata !{i32 786436, metadata !1, metadata !4, metadata !"", i32 20, i64 32, i64 32, i32 0, i32 0, null, metadata !6, i32 0, null, null, null} ; [ DW_TAG_enumeration_type ] [line 20, size 32, align 32, offset 0] [def] [from ]
!4 = metadata !{i32 786451, metadata !1, null, metadata !"C", i32 19, i64 8, i64 8, i32 0, i32 0, null, metadata !5, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [C] [line 19, size 8, align 8, offset 0] [def] [from ]
-!5 = metadata !{i32 0}
+!5 = metadata !{}
!6 = metadata !{metadata !7}
!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
!8 = metadata !{metadata !9}
diff --git a/test/CodeGen/X86/stackmap-liveness.ll b/test/CodeGen/X86/stackmap-liveness.ll
new file mode 100644
index 0000000..9ce5254
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-liveness.ll
@@ -0,0 +1,245 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-stackmap-liveness | FileCheck -check-prefix=STACK %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx -disable-fp-elim -enable-patchpoint-liveness | FileCheck -check-prefix=PATCH %s
+;
+; Note: Print verbose stackmaps using -debug-only=stackmaps.
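+;
+; For example (an assertions-enabled build of llc is required for -debug-only):
+;   llc < stackmap-liveness.ll -mtriple=x86_64-apple-darwin -mcpu=corei7-avx \
+;       -disable-fp-elim -enable-stackmap-liveness -debug-only=stackmaps
+;
+; -enable-stackmap-liveness records live-out registers only for stackmap
+; intrinsics and -enable-patchpoint-liveness only for patchpoints, which is
+; why mixed_liveness below gets different LiveOut entries under the STACK and
+; PATCH prefixes.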
+
+; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
+; CHECK-NEXT: __LLVM_StackMaps:
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 2
+; Num LargeConstants
+; CHECK-NEXT: .long 0
+; Num Callsites
+; CHECK-NEXT: .long 5
+
+; Functions and stack size
+; CHECK-NEXT: .quad _stackmap_liveness
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _mixed_liveness
+; CHECK-NEXT: .quad 8
+
+define void @stackmap_liveness() {
+entry:
+ %a1 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+; StackMap 1 (no liveness information available)
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; CHECK-NEXT: .short 0
+; Align
+; CHECK-NEXT: .align 3
+
+; StackMap 1 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 2
+; STACK-NEXT: .short 2
+; LiveOut Entry 1: %RSP (8 bytes)
+; STACK-NEXT: .short 7
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 1 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 5)
+ %a2 = call i64 asm sideeffect "", "={r8}"() nounwind
+ %a3 = call i8 asm sideeffect "", "={ah}"() nounwind
+ %a4 = call <4 x double> asm sideeffect "", "={ymm0}"() nounwind
+ %a5 = call <4 x double> asm sideeffect "", "={ymm1}"() nounwind
+
+; StackMap 2 (no liveness information available)
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; CHECK-NEXT: .short 0
+; Align
+; CHECK-NEXT: .align 3
+
+; StackMap 2 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 6
+; STACK-NEXT: .short 6
+; LiveOut Entry 1: %RAX (1 byte) --> %AL or %AH
+; STACK-NEXT: .short 0
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 1
+; LiveOut Entry 2: %RSP (8 bytes)
+; STACK-NEXT: .short 7
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 3: %R8 (8 bytes)
+; STACK-NEXT: .short 8
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 4: %YMM0 (32 bytes)
+; STACK-NEXT: .short 17
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 32
+; LiveOut Entry 5: %YMM1 (32 bytes)
+; STACK-NEXT: .short 18
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 32
+; LiveOut Entry 6: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 2 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 2, i32 5)
+ call void asm sideeffect "", "{r8},{ah},{ymm0},{ymm1}"(i64 %a2, i8 %a3, <4 x double> %a4, <4 x double> %a5) nounwind
+
+; StackMap 3 (no liveness information available)
+; CHECK-LABEL: .long L{{.*}}-_stackmap_liveness
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 0
+; Padding
+; CHECK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; CHECK-NEXT: .short 0
+; Align
+; CHECK-NEXT: .align 3
+
+; StackMap 3 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_stackmap_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 2
+; STACK-NEXT: .short 2
+; LiveOut Entry 1: %RSP (8 bytes)
+; STACK-NEXT: .short 7
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 8
+; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 3 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_stackmap_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 5)
+ call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
+ ret void
+}
+
+define void @mixed_liveness() {
+entry:
+ %a1 = call <2 x double> asm sideeffect "", "={xmm2}"() nounwind
+; StackMap 4 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_mixed_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 1
+; STACK-NEXT: .short 1
+; LiveOut Entry 1: %YMM2 (16 bytes) --> %XMM2
+; STACK-NEXT: .short 19
+; STACK-NEXT: .byte 0
+; STACK-NEXT: .byte 16
+; Align
+; STACK-NEXT: .align 3
+
+
+; StackMap 5 (stackmap liveness information enabled)
+; STACK-LABEL: .long L{{.*}}-_mixed_liveness
+; STACK-NEXT: .short 0
+; STACK-NEXT: .short 0
+; Padding
+; STACK-NEXT: .short 0
+; Num LiveOut Entries: 0
+; STACK-NEXT: .short 0
+; Align
+; STACK-NEXT: .align 3
+
+; StackMap 4 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_mixed_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 0
+; PATCH-NEXT: .short 0
+; Align
+; PATCH-NEXT: .align 3
+
+; StackMap 5 (patchpoint liveness information enabled)
+; PATCH-LABEL: .long L{{.*}}-_mixed_liveness
+; PATCH-NEXT: .short 0
+; PATCH-NEXT: .short 0
+; Padding
+; PATCH-NEXT: .short 0
+; Num LiveOut Entries: 2
+; PATCH-NEXT: .short 2
+; LiveOut Entry 1: %RSP (8 bytes)
+; PATCH-NEXT: .short 7
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 8
+; LiveOut Entry 2: %YMM2 (16 bytes) --> %XMM2
+; PATCH-NEXT: .short 19
+; PATCH-NEXT: .byte 0
+; PATCH-NEXT: .byte 16
+; Align
+; PATCH-NEXT: .align 3
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 5)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 5, i32 0, i8* null, i32 0)
+ call void asm sideeffect "", "{xmm2}"(<2 x double> %a1) nounwind
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
diff --git a/test/CodeGen/X86/stackmap-nops.ll b/test/CodeGen/X86/stackmap-nops.ll
new file mode 100644
index 0000000..5a78f24
--- /dev/null
+++ b/test/CodeGen/X86/stackmap-nops.ll
@@ -0,0 +1,230 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
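+
+; Each stackmap below reserves a shadow of N bytes (the second argument to
+; the intrinsic), which the backend pads with canonical x86 nops. An x86
+; instruction is at most 15 bytes, so shadows longer than that are assembled
+; from prepended 66H prefix bytes (.byte 102) plus the 10-byte
+; nopw %cs:512(%rax,%rax) form, as the numbered checks below enumerate.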
+
+define void @nop_test() {
+entry:
+; CHECK-LABEL: nop_test:
+; CHECK: nop
+; CHECK: xchgw %ax, %ax
+; CHECK: nopl (%rax)
+; CHECK: nopl 8(%rax)
+; CHECK: nopl 8(%rax,%rax)
+; CHECK: nopw 8(%rax,%rax)
+; CHECK: nopl 512(%rax)
+; CHECK: nopl 512(%rax,%rax)
+; CHECK: nopw 512(%rax,%rax)
+; CHECK: nopw %cs:512(%rax,%rax)
+
+; 11
+; CHECK: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 12
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 13
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 14
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 15
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 16
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nop
+
+; 17
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: xchgw %ax, %ax
+
+; 18
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl (%rax)
+
+; 19
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 8(%rax)
+
+; 20
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 8(%rax,%rax)
+
+; 21
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopw 8(%rax,%rax)
+
+; 22
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 512(%rax)
+
+; 23
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopl 512(%rax,%rax)
+
+; 24
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopw 512(%rax,%rax)
+
+; 25
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 26
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 27
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 28
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 29
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+
+; 30
+; CHECK: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: .byte 102
+; CHECK-NEXT: nopw %cs:512(%rax,%rax)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 0, i32 0)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 1, i32 1)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 2, i32 2)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 3)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 4, i32 4)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 5, i32 5)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 6, i32 6)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 7, i32 7)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 8, i32 8)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 9, i32 9)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 10, i32 10)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 11, i32 11)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 12)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 13)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 14)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 15)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 16)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 17, i32 17)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 18, i32 18)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 19, i32 19)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 20, i32 20)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 21, i32 21)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 22, i32 22)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 23, i32 23)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 24, i32 24)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 25, i32 25)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 26, i32 26)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 27, i32 27)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 28, i32 28)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 29, i32 29)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 30, i32 30)
+ ret void
+}
+
+declare void @llvm.experimental.stackmap(i64, i32, ...)
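
The checks above pin down the shadow-padding scheme: a shadow of 1-15 bytes becomes a single NOP (the 11-15-byte forms are 0x66 prefixes stacked on the 10-byte nopw %cs:512(%rax,%rax)), and anything longer is emitted as 15-byte chunks followed by one shorter NOP, which is exactly the 15+1 and 15+11 patterns visible at sizes 16 and 26. A C sketch of that emission loop under those assumptions (the byte sequences are the canonical long-NOP encodings; the real logic lives in the X86 assembler backend):

#include <stdint.h>
#include <string.h>

/* Canonical x86-64 NOPs of length 1..10; the 10-byte form is the
   "nopw %cs:512(%rax,%rax)" these checks look for. */
static const uint8_t NOPS[10][10] = {
    {0x90},                                                        /* nop                     */
    {0x66, 0x90},                                                  /* xchgw %ax, %ax          */
    {0x0F, 0x1F, 0x00},                                            /* nopl (%rax)             */
    {0x0F, 0x1F, 0x40, 0x08},                                      /* nopl 8(%rax)            */
    {0x0F, 0x1F, 0x44, 0x00, 0x08},                                /* nopl 8(%rax,%rax)       */
    {0x66, 0x0F, 0x1F, 0x44, 0x00, 0x08},                          /* nopw 8(%rax,%rax)       */
    {0x0F, 0x1F, 0x80, 0x00, 0x02, 0x00, 0x00},                    /* nopl 512(%rax)          */
    {0x0F, 0x1F, 0x84, 0x00, 0x00, 0x02, 0x00, 0x00},              /* nopl 512(%rax,%rax)     */
    {0x66, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x02, 0x00, 0x00},        /* nopw 512(%rax,%rax)     */
    {0x66, 0x2E, 0x0F, 0x1F, 0x84, 0x00, 0x00, 0x02, 0x00, 0x00},  /* nopw %cs:512(%rax,%rax) */
};

/* Pad with n bytes of NOPs: chunks of at most 15 bytes, where an 11..15-byte
   chunk is (k - 10) extra 0x66 prefixes stacked on the 10-byte NOP. */
static void emit_nops(uint8_t *buf, size_t n) {
    while (n) {
        size_t k = n < 15 ? n : 15;
        size_t prefixes = k > 10 ? k - 10 : 0;
        memset(buf, 0x66, prefixes);
        memcpy(buf + prefixes, NOPS[k - prefixes - 1], k - prefixes);
        buf += k;
        n -= k;
    }
}
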
diff --git a/test/CodeGen/X86/stackmap.ll b/test/CodeGen/X86/stackmap.ll
index ed95583..8567037 100644
--- a/test/CodeGen/X86/stackmap.ll
+++ b/test/CodeGen/X86/stackmap.ll
@@ -1,27 +1,74 @@
-; RUN: llc < %s -mtriple=x86_64-apple-darwin -disable-fp-elim | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -disable-fp-elim | FileCheck %s
;
; Note: Print verbose stackmaps using -debug-only=stackmaps.
; CHECK-LABEL: .section __LLVM_STACKMAPS,__llvm_stackmaps
; CHECK-NEXT: __LLVM_StackMaps:
-; CHECK-NEXT: .long 0
+; Header
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 0
+; CHECK-NEXT: .short 0
+; Num Functions
+; CHECK-NEXT: .long 15
; Num LargeConstants
-; CHECK-NEXT: .long 1
-; CHECK-NEXT: .quad 4294967296
+; CHECK-NEXT: .long 3
; Num Callsites
-; CHECK-NEXT: .long 11
+; CHECK-NEXT: .long 19
+
+; Functions and stack size
+; CHECK-NEXT: .quad _constantargs
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _osrinline
+; CHECK-NEXT: .quad 24
+; CHECK-NEXT: .quad _osrcold
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _propertyRead
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _propertyWrite
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _jsVoidCall
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _jsIntCall
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _spilledValue
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _spilledStackMapValue
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _spillSubReg
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _subRegOffset
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _liveConstant
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _directFrameIdx
+; CHECK-NEXT: .quad 56
+; CHECK-NEXT: .quad _longid
+; CHECK-NEXT: .quad 8
+; CHECK-NEXT: .quad _clobberScratch
+; CHECK-NEXT: .quad 56
+
+; Large Constants
+; CHECK-NEXT: .quad 2147483648
+; CHECK-NEXT: .quad 4294967295
+; CHECK-NEXT: .quad 4294967296
+; Callsites
; Constant arguments
;
-; CHECK-NEXT: .long 1
+; CHECK-NEXT: .quad 1
; CHECK-NEXT: .long L{{.*}}-_constantargs
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 4
+; CHECK-NEXT: .short 12
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 65535
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
; SmallConstant
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 8
@@ -31,24 +78,58 @@
; CHECK-NEXT: .byte 4
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2000000000
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2147483647
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
; CHECK-NEXT: .long -1
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
; LargeConstant at index 0
; CHECK-NEXT: .byte 5
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short 0
; CHECK-NEXT: .long 0
+; LargeConstant at index 1
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 1
+; LargeConstant at index 2
+; CHECK-NEXT: .byte 5
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 2
+; SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long -1
define void @constantargs() {
entry:
%0 = inttoptr i64 12345 to i8*
- tail call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 1, i32 15, i8* %0, i32 0, i64 65535, i64 65536, i64 4294967295, i64 4294967296)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 1, i32 15, i8* %0, i32 0, i16 65535, i16 -1, i32 65536, i32 2000000000, i32 2147483647, i32 -1, i32 4294967295, i32 4294967296, i64 2147483648, i64 4294967295, i64 4294967296, i64 -1)
ret void
}
; Inline OSR Exit
;
-; CHECK-NEXT: .long 3
-; CHECK-NEXT: .long L{{.*}}-_osrinline
+; CHECK-LABEL: .long L{{.*}}-_osrinline
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -64,7 +145,7 @@ entry:
; Runtime void->void call.
call void inttoptr (i64 -559038737 to void ()*)()
; Followed by inline OSR patchpoint with 12-byte shadow and 2 live vars.
- call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 3, i32 12, i64 %a, i64 %b)
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 3, i32 12, i64 %a, i64 %b)
ret void
}
@@ -72,8 +153,7 @@ entry:
;
; 2 live variables in register.
;
-; CHECK-NEXT: .long 4
-; CHECK-NEXT: .long L{{.*}}-_osrcold
+; CHECK-LABEL: .long L{{.*}}-_osrcold
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -83,7 +163,7 @@ entry:
; CHECK-NEXT: .byte 1
; CHECK-NEXT: .byte 8
; CHECK-NEXT: .short {{[0-9]+}}
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .long 0
define void @osrcold(i64 %a, i64 %b) {
entry:
%test = icmp slt i64 %a, %b
@@ -91,40 +171,48 @@ entry:
cold:
; OSR patchpoint with 12-byte nop-slide and 2 live vars.
%thunk = inttoptr i64 -559038737 to i8*
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4, i32 15, i8* %thunk, i32 0, i64 %a, i64 %b)
unreachable
ret:
ret void
}
; Property Read
-; CHECK-NEXT: .long 5
-; CHECK-NEXT: .long L{{.*}}-_propertyRead
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 0
-;
-; FIXME: There are currently no stackmap entries. After moving to
-; AnyRegCC, we will have entries for the object and return value.
+; CHECK-LABEL: .long L{{.*}}-_propertyRead
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
define i64 @propertyRead(i64* %obj) {
entry:
%resolveRead = inttoptr i64 -559038737 to i8*
- %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
+ %result = call anyregcc i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 5, i32 15, i8* %resolveRead, i32 1, i64* %obj)
%add = add i64 %result, 3
ret i64 %add
}
; Property Write
-; CHECK-NEXT: .long 6
-; CHECK-NEXT: .long L{{.*}}-_propertyWrite
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 0
-;
-; FIXME: There are currently no stackmap entries. After moving to
-; AnyRegCC, we will have entries for the object and return value.
+; CHECK-LABEL: .long L{{.*}}-_propertyWrite
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 2
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short {{[0-9]+}}
+; CHECK-NEXT: .long 0
define void @propertyWrite(i64 %dummy1, i64* %obj, i64 %dummy2, i64 %a) {
entry:
%resolveWrite = inttoptr i64 -559038737 to i8*
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
+ call anyregcc void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 6, i32 15, i8* %resolveWrite, i32 2, i64* %obj, i64 %a)
ret void
}
@@ -132,8 +220,7 @@ entry:
;
; 2 live variables in registers.
;
-; CHECK-NEXT: .long 7
-; CHECK-NEXT: .long L{{.*}}-_jsVoidCall
+; CHECK-LABEL: .long L{{.*}}-_jsVoidCall
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -147,7 +234,7 @@ entry:
define void @jsVoidCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 -559038737 to i8*
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 7, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
ret void
}
@@ -155,8 +242,7 @@ entry:
;
; 2 live variables in registers.
;
-; CHECK: .long 8
-; CHECK-NEXT: .long L{{.*}}-_jsIntCall
+; CHECK-LABEL: .long L{{.*}}-_jsIntCall
; CHECK-NEXT: .short 0
; CHECK-NEXT: .short 2
; CHECK-NEXT: .byte 1
@@ -170,7 +256,7 @@ entry:
define i64 @jsIntCall(i64 %dummy1, i64* %obj, i64 %arg, i64 %l1, i64 %l2) {
entry:
%resolveCall = inttoptr i64 -559038737 to i8*
- %result = call i64 (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i32 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
+ %result = call i64 (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.i64(i64 8, i32 15, i8* %resolveCall, i32 2, i64* %obj, i64 %arg, i64 %l1, i64 %l2)
%add = add i64 %result, 3
ret i64 %add
}
@@ -179,19 +265,18 @@ entry:
;
; Verify 17 stack map entries.
;
-; CHECK: .long 11
-; CHECK-NEXT: .long L{{.*}}-_spilledValue
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 17
+; CHECK-LABEL: .long L{{.*}}-_spilledValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 17
;
; Check that at least one is a spilled entry from RBP.
; Location: Indirect RBP + ...
-; CHECK: .byte 3
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 6
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
define void @spilledValue(i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
entry:
- call void (i32, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i32 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 11, i32 15, i8* null, i32 5, i64 %arg0, i64 %arg1, i64 %arg2, i64 %arg3, i64 %arg4, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
ret void
}
@@ -199,35 +284,33 @@ entry:
;
; Verify 17 stack map entries.
;
-; CHECK: .long 12
-; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .short 17
+; CHECK-LABEL: .long L{{.*}}-_spilledStackMapValue
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .short 17
;
; Check that at least one is a spilled entry from RBP.
; Location: Indirect RBP + ...
-; CHECK: .byte 3
-; CHECK-NEXT: .byte 8
-; CHECK-NEXT: .short 6
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
define webkit_jscc void @spilledStackMapValue(i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16) {
entry:
- call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 12, i32 15, i64 %l0, i64 %l1, i64 %l2, i64 %l3, i64 %l4, i64 %l5, i64 %l6, i64 %l7, i64 %l8, i64 %l9, i64 %l10, i64 %l11, i64 %l12, i64 %l13, i64 %l14, i64 %l15, i64 %l16)
ret void
}
; Spill a subregister stackmap operand.
;
-; CHECK: .long 13
-; CHECK-LABEL: .long L{{.*}}-_spillSubReg
-; CHECK-NEXT: .short 0
+; CHECK-LABEL: .long L{{.*}}-_spillSubReg
+; CHECK-NEXT: .short 0
; 1 location
-; CHECK-NEXT: .short 1
+; CHECK-NEXT: .short 1
;
; Check that the subregister operand is a 4-byte spill.
; Location: Indirect, 4-byte, RBP + ...
-; CHECK: .byte 3
-; CHECK-NEXT: .byte 4
-; CHECK-NEXT: .short 6
+; CHECK: .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 6
define void @spillSubReg(i64 %arg) #0 {
bb:
br i1 undef, label %bb1, label %bb2
@@ -248,7 +331,7 @@ bb17:
bb60:
tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 13, i32 5, i32 %tmp32)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 13, i32 5, i32 %tmp32)
unreachable
bb61:
@@ -258,24 +341,23 @@ bb61:
; Map a single byte subregister. There is no DWARF register number, so we expect the
; register to be encoded with the proper size and spill offset; we don't know which register it will be.
;
-; CHECK: .long 14
-; CHECK-LABEL: .long L{{.*}}-_subRegOffset
-; CHECK-NEXT: .short 0
+; CHECK-LABEL: .long L{{.*}}-_subRegOffset
+; CHECK-NEXT: .short 0
; 2 locations
-; CHECK-NEXT: .short 2
+; CHECK-NEXT: .short 2
;
; Check that the subregister operands are 1-byte spills.
; Location 0: Register, 1-byte, AL
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .short 0
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 0
;
; Location 1: Register, 1-byte, BL
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .byte 1
-; CHECK-NEXT: .short 3
-; CHECK-NEXT: .long 0
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .byte 1
+; CHECK-NEXT: .short 3
+; CHECK-NEXT: .long 0
define void @subRegOffset(i16 %arg) {
%v = mul i16 %arg, 5
%a0 = trunc i16 %v to i8
@@ -283,10 +365,105 @@ define void @subRegOffset(i16 %arg) {
%arghi = lshr i16 %v, 8
%a1 = trunc i16 %arghi to i8
tail call void asm sideeffect "nop", "~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r11},~{r12},~{r13},~{r14},~{r15}"() nounwind
- tail call void (i32, i32, ...)* @llvm.experimental.stackmap(i32 14, i32 5, i8 %a0, i8 %a1)
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 14, i32 5, i8 %a0, i8 %a1)
+ ret void
+}
+
+; Map a constant value.
+;
+; CHECK-LABEL: .long L{{.*}}-_liveConstant
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: SmallConstant
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 0
+; CHECK-NEXT: .long 33
+
+define void @liveConstant() {
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 15, i32 5, i32 33)
+ ret void
+}
+
+; Directly map an alloca's address.
+;
+; Callsite 16
+; CHECK-LABEL: .long L{{.*}}-_directFrameIdx
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+
+; Callsite 17
+; CHECK-LABEL: .long L{{.*}}-_directFrameIdx
+; CHECK-NEXT: .short 0
+; 2 locations
+; CHECK-NEXT: .short 2
+; Loc 0: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+; Loc 1: Direct RBP - ofs
+; CHECK-NEXT: .byte 2
+; CHECK-NEXT: .byte 8
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long
+define void @directFrameIdx() {
+entry:
+ %metadata1 = alloca i64, i32 3, align 8
+ store i64 11, i64* %metadata1
+ store i64 12, i64* %metadata1
+ store i64 13, i64* %metadata1
+ call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 0, i64* %metadata1)
+ %metadata2 = alloca i8, i32 4, align 8
+ %metadata3 = alloca i16, i32 4, align 8
+ call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 17, i32 5, i8* null, i32 0, i8* %metadata2, i16* %metadata3)
+ ret void
+}
+
+; Test a 64-bit ID.
+;
+; CHECK: .quad 4294967295
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad 4294967296
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad 9223372036854775807
+; CHECK-LABEL: .long L{{.*}}-_longid
+; CHECK: .quad -1
+; CHECK-LABEL: .long L{{.*}}-_longid
+define void @longid() {
+entry:
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967295, i32 0, i8* null, i32 0)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 4294967296, i32 0, i8* null, i32 0)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 9223372036854775807, i32 0, i8* null, i32 0)
+ tail call void (i64, i32, i8*, i32, ...)* @llvm.experimental.patchpoint.void(i64 -1, i32 0, i8* null, i32 0)
+ ret void
+}
+
+; Map a value when R11 is the only free register.
+; The scratch register should not be used for a live stackmap value.
+;
+; CHECK-LABEL: .long L{{.*}}-_clobberScratch
+; CHECK-NEXT: .short 0
+; 1 location
+; CHECK-NEXT: .short 1
+; Loc 0: Indirect fp - offset
+; CHECK-NEXT: .byte 3
+; CHECK-NEXT: .byte 4
+; CHECK-NEXT: .short 6
+; CHECK-NEXT: .long -{{[0-9]+}}
+define void @clobberScratch(i32 %a) {
+ tail call void asm sideeffect "nop", "~{ax},~{bx},~{cx},~{dx},~{bp},~{si},~{di},~{r8},~{r9},~{r10},~{r12},~{r13},~{r14},~{r15}"() nounwind
+ tail call void (i64, i32, ...)* @llvm.experimental.stackmap(i64 16, i32 8, i32 %a)
ret void
}
-declare void @llvm.experimental.stackmap(i32, i32, ...)
-declare void @llvm.experimental.patchpoint.void(i32, i32, i8*, i32, ...)
-declare i64 @llvm.experimental.patchpoint.i64(i32, i32, i8*, i32, ...)
+declare void @llvm.experimental.stackmap(i64, i32, ...)
+declare void @llvm.experimental.patchpoint.void(i64, i32, i8*, i32, ...)
+declare i64 @llvm.experimental.patchpoint.i64(i64, i32, i8*, i32, ...)
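
Taken together, the updated checks trace out version 1 of the stackmap layout: a 4-byte header (.byte version, .byte reserved, .short reserved), three uint32 counts, one {address, stack size} record per function, the large-constant pool, and then one variable-length record per call site. A C sketch of that layout as these tests read it (little-endian, fields in emission order; struct names are illustrative):

#include <stdint.h>

/* __LLVM_StackMaps, format version 1, as pinned down by the checks above. */
typedef struct {
    uint8_t  version;    /* .byte 1  */
    uint8_t  reserved0;  /* .byte 0  */
    uint16_t reserved1;  /* .short 0 */
    /* followed by: uint32 num_functions; uint32 num_constants; uint32 num_records; */
} StackMapHeader;

typedef struct { uint64_t function_address; uint64_t stack_size; } StkSizeRecord;

typedef struct {
    uint8_t  type;         /* 1=Register 2=Direct 3=Indirect 4=Constant 5=ConstantIndex */
    uint8_t  size;         /* location size in bytes */
    uint16_t dwarf_regnum;
    int32_t  offset;       /* or small constant / constant-pool index */
} Location;

typedef struct { uint16_t dwarf_regnum; uint8_t reserved; uint8_t size; } LiveOut;

/* Each call-site record is variable length:
     uint64  patchpoint_id;       (.quad)
     uint32  instruction_offset;  (.long L... - _function)
     uint16  reserved;            (.short 0)
     uint16  num_locations;       followed by Location[num_locations]
     uint16  padding;
     uint16  num_live_outs;       followed by LiveOut[num_live_outs]
   and is then padded to 8-byte alignment (.align 3). */
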
diff --git a/test/CodeGen/X86/stdcall-notailcall.ll b/test/CodeGen/X86/stdcall-notailcall.ll
index 8f522cd..448db4c 100644
--- a/test/CodeGen/X86/stdcall-notailcall.ll
+++ b/test/CodeGen/X86/stdcall-notailcall.ll
@@ -4,10 +4,18 @@
define x86_stdcallcc void @bar(%struct.I* nocapture %this) ssp align 2 {
; CHECK-LABEL: bar:
; CHECK-NOT: jmp
-; CHECK: ret $4
+; CHECK: retl $4
entry:
tail call void @foo()
ret void
}
+define x86_thiscallcc void @test2(%struct.I* %this, i32 %a) {
+; CHECK-LABEL: test2:
+; CHECK: calll _foo
+; CHECK: retl $4
+ tail call void @foo()
+ ret void
+}
+
declare void @foo()
diff --git a/test/CodeGen/X86/stdcall.ll b/test/CodeGen/X86/stdcall.ll
index 73826ed..3cefe14 100644
--- a/test/CodeGen/X86/stdcall.ll
+++ b/test/CodeGen/X86/stdcall.ll
@@ -6,14 +6,14 @@
define internal x86_stdcallcc void @MyFunc() nounwind {
entry:
; CHECK: MyFunc@0:
-; CHECK: ret
+; CHECK: retl
ret void
}
; PR14410
define x86_stdcallcc i32 @"\01DoNotMangle"(i32 %a) {
; CHECK: DoNotMangle:
-; CHECK: ret $4
+; CHECK: retl $4
entry:
ret i32 %a
}
diff --git a/test/CodeGen/X86/stores-merging.ll b/test/CodeGen/X86/stores-merging.ll
new file mode 100644
index 0000000..61dea08
--- /dev/null
+++ b/test/CodeGen/X86/stores-merging.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s | FileCheck %s
+
+target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%structTy = type { i8, i32, i32 }
+
+@e = common global %structTy zeroinitializer, align 4
+
+; CHECK-LABEL: f
+define void @f() {
+entry:
+
+; CHECK: movabsq $528280977409, %rax
+; CHECK: movq %rax, e+4(%rip)
+; CHECK: movl $456, e+8(%rip)
+
+ store i32 1, i32* getelementptr inbounds (%structTy* @e, i64 0, i32 1), align 4
+ store i32 123, i32* getelementptr inbounds (%structTy* @e, i64 0, i32 2), align 4
+ store i32 456, i32* getelementptr inbounds (%structTy* @e, i64 0, i32 2), align 4
+ ret void
+}
+
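The movabsq immediate in the checks above is just the two neighboring 32-bit stores packed into one 64-bit value: the store of 1 at offset 4 supplies the low word, the dead store of 123 at offset 8 the high word, and (123 << 32) | 1 = 528280977409; the surviving store of 456 is emitted separately. A one-line C check of that arithmetic:

#include <assert.h>
#include <stdint.h>

int main(void) {
    /* low 32 bits = 1 (stored at e+4), high 32 bits = 123 (stored at e+8) */
    uint64_t merged = ((uint64_t)123 << 32) | 1;
    assert(merged == 528280977409ULL);  /* the movabsq immediate above */
    return 0;
}
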
diff --git a/test/CodeGen/X86/sunkaddr-ext.ll b/test/CodeGen/X86/sunkaddr-ext.ll
new file mode 100644
index 0000000..6d23867
--- /dev/null
+++ b/test/CodeGen/X86/sunkaddr-ext.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+
+; Test to make sure that when address math can roll over, we don't sink it
+; into a different basic block and later use the potentially overflowed value
+; as the basis for an address calculation.
+
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @test_sink(i8* %arg1, i32 %arg2, i8 %arg3) #0 {
+ %tmp1 = add i32 -2147483648, %arg2
+ %tmp2 = add i32 -2147483648, %tmp1
+ %tmp3 = getelementptr i8* %arg1, i32 %arg2
+ br label %bb1
+
+bb1:
+ %tmp4 = getelementptr i8* %arg1, i32 %tmp2
+ store i8 %arg3, i8* %tmp4
+ ret void
+}
+
+; CHECK-LABEL: test_sink:
+; CHECK: movslq %esi, [[TEMP:%[a-z0-9]+]]
+; CHECK: movb %dl, (%rdi,[[TEMP]])
+; CHECK: retq
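
The point of the test: in 32 bits, adding -2147483648 twice wraps around to a no-op, so %tmp2 equals %arg2 as an i32, and the only correct address is the base plus the sign-extended 32-bit result, which is what the expected movslq computes. Sinking the adds into bb1 and folding them into 64-bit address arithmetic would bake the overflow into the address. A small C sketch of the wrap (values are illustrative):

#include <assert.h>
#include <stdint.h>

int main(void) {
    uint32_t arg2 = 12345;
    /* two INT32_MIN additions cancel modulo 2^32 */
    uint32_t tmp2 = arg2 + 0x80000000u + 0x80000000u;
    assert(tmp2 == arg2);
    /* correct address math: sign-extend the 32-bit result, then add to the base */
    int64_t index = (int64_t)(int32_t)tmp2;
    assert(index == (int64_t)(int32_t)arg2);
    return 0;
}
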
diff --git a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
index 1bc6175..1beee72 100644
--- a/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/tbm-intrinsics-x86_64.ll
@@ -34,7 +34,7 @@ declare i64 @llvm.x86.tbm.bextri.u64(i64, i64) nounwind readnone
define i64 @test_x86_tbm_bextri_u64_m(i64* nocapture %a) nounwind readonly {
entry:
- ; CHECK-LABEl: test_x86_tbm_bextri_u64_m:
+ ; CHECK-LABEL: test_x86_tbm_bextri_u64_m:
; CHECK-NOT: mov
; CHECK: bextr $
%tmp1 = load i64* %a, align 8
diff --git a/test/CodeGen/X86/v2f32.ll b/test/CodeGen/X86/v2f32.ll
index f2bebf5..dab5e7b 100644
--- a/test/CodeGen/X86/v2f32.ll
+++ b/test/CodeGen/X86/v2f32.ll
@@ -24,9 +24,9 @@ define void @test1(<2 x float> %Q, float *%P2) nounwind {
; W64-NEXT: ret
; X32-LABEL: test1:
+; X32-NEXT: movl 4(%esp), %eax
; X32-NEXT: pshufd $1, %xmm0, %xmm1
; X32-NEXT: addss %xmm0, %xmm1
-; X32-NEXT: movl 4(%esp), %eax
; X32-NEXT: movss %xmm1, (%eax)
; X32-NEXT: ret
}
diff --git a/test/CodeGen/X86/v4i32load-crash.ll b/test/CodeGen/X86/v4i32load-crash.ll
index 052c4c3..3e7f9e6 100644
--- a/test/CodeGen/X86/v4i32load-crash.ll
+++ b/test/CodeGen/X86/v4i32load-crash.ll
@@ -1,10 +1,11 @@
-; RUN: llc --mcpu=x86-64 --mattr=ssse3 < %s
+; RUN: llc --march=x86 --mcpu=x86-64 --mattr=ssse3 < %s
+; RUN: llc --march=x86-64 --mcpu=x86-64 --mattr=ssse3 < %s
;PR18045:
;Issue with instruction selection for a 'v4i32 load'.
;This instruction is not legal for X86 CPUs with sse < 'sse4.1'.
;This node was generated by X86ISelLowering.cpp, EltsFromConsecutiveLoads
-;static function after legilize stage.
+;static function after legalize stage.
@e = external global [4 x i32], align 4
@f = external global [4 x i32], align 4
diff --git a/test/CodeGen/X86/vaargs.ll b/test/CodeGen/X86/vaargs.ll
new file mode 100644
index 0000000..ddeb7a3
--- /dev/null
+++ b/test/CodeGen/X86/vaargs.ll
@@ -0,0 +1,67 @@
+; RUN: llc -mcpu=corei7-avx %s -o - | FileCheck %s --check-prefix=CHECK --check-prefix=NO-FLAGS
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+%struct.__va_list_tag = type { i32, i32, i8*, i8* }
+
+; Check that va_start saves the incoming XMM argument registers
+; to the register save area.
+define i32 @sum(i32 %count, ...) nounwind optsize ssp uwtable {
+; CHECK: testb %al, %al
+; CHECK-NEXT: je
+; CHECK-NEXT: ## BB#{{[0-9]+}}:
+; CHECK-NEXT: vmovaps %xmm0, 48(%rsp)
+; CHECK-NEXT: vmovaps %xmm1, 64(%rsp)
+; CHECK-NEXT: vmovaps %xmm2, 80(%rsp)
+; CHECK-NEXT: vmovaps %xmm3, 96(%rsp)
+; CHECK-NEXT: vmovaps %xmm4, 112(%rsp)
+; CHECK-NEXT: vmovaps %xmm5, 128(%rsp)
+; CHECK-NEXT: vmovaps %xmm6, 144(%rsp)
+; CHECK-NEXT: vmovaps %xmm7, 160(%rsp)
+
+; Check that [EFLAGS] hasn't been pulled in.
+; NO-FLAGS-NOT: %flags
+
+ %ap = alloca [1 x %struct.__va_list_tag], align 16
+ %1 = bitcast [1 x %struct.__va_list_tag]* %ap to i8*
+ call void @llvm.va_start(i8* %1)
+ %2 = icmp sgt i32 %count, 0
+ br i1 %2, label %.lr.ph, label %._crit_edge
+
+.lr.ph: ; preds = %0
+ %3 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 0
+ %4 = getelementptr inbounds [1 x %struct.__va_list_tag]* %ap, i64 0, i64 0, i32 2
+ %.pre = load i32* %3, align 16
+ br label %5
+
+; <label>:5 ; preds = %.lr.ph, %13
+ %6 = phi i32 [ %.pre, %.lr.ph ], [ %14, %13 ]
+ %.01 = phi i32 [ %count, %.lr.ph ], [ %15, %13 ]
+ %7 = icmp ult i32 %6, 41
+ br i1 %7, label %8, label %10
+
+; <label>:8 ; preds = %5
+ %9 = add i32 %6, 8
+ store i32 %9, i32* %3, align 16
+ br label %13
+
+; <label>:10 ; preds = %5
+ %11 = load i8** %4, align 8
+ %12 = getelementptr i8* %11, i64 8
+ store i8* %12, i8** %4, align 8
+ br label %13
+
+; <label>:13 ; preds = %10, %8
+ %14 = phi i32 [ %6, %10 ], [ %9, %8 ]
+ %15 = add nsw i32 %.01, 1
+ %16 = icmp sgt i32 %15, 0
+ br i1 %16, label %5, label %._crit_edge
+
+._crit_edge: ; preds = %13, %0
+ %.0.lcssa = phi i32 [ %count, %0 ], [ %15, %13 ]
+ call void @llvm.va_end(i8* %1)
+ ret i32 %.0.lcssa
+}
+
+declare void @llvm.va_start(i8*) nounwind
+
+declare void @llvm.va_end(i8*) nounwind
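
The loop above is a hand-expanded SysV x86-64 va_arg: field 0 of the __va_list_tag is gp_offset, field 2 is overflow_arg_area, and an argument is fetched from the register save area while gp_offset is at most 40 (the icmp ult ..., 41), otherwise from the stack. A C sketch of the equivalent expansion, assuming the standard SysV va_list layout (field names are the conventional ones, not taken from this test):

#include <stdint.h>

/* SysV x86-64 va_list element: the { i32, i32, i8*, i8* } in the IR above. */
struct va_tag {
    unsigned gp_offset;         /* offset into the GP register save area */
    unsigned fp_offset;         /* offset into the FP/vector save area   */
    char    *overflow_arg_area; /* next stack-passed argument            */
    char    *reg_save_area;     /* spilled incoming registers            */
};

/* Hand-expanded va_arg(ap, long), mirroring the branch structure above. */
static long next_long(struct va_tag *ap) {
    if (ap->gp_offset <= 40) {              /* 6 GP regs * 8 bytes = 48 */
        long v = *(long *)(ap->reg_save_area + ap->gp_offset);
        ap->gp_offset += 8;
        return v;
    }
    long v = *(long *)ap->overflow_arg_area;
    ap->overflow_arg_area += 8;
    return v;
}
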
diff --git a/test/CodeGen/X86/vastart-defs-eflags.ll b/test/CodeGen/X86/vastart-defs-eflags.ll
new file mode 100644
index 0000000..6017753
--- /dev/null
+++ b/test/CodeGen/X86/vastart-defs-eflags.ll
@@ -0,0 +1,23 @@
+; RUN: llc %s -o - | FileCheck %s
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.10.0"
+
+; Check that vastart handling doesn't get between testb and je for the branch.
+define i32 @check_flag(i32 %flags, ...) nounwind {
+entry:
+; CHECK: {{^}} testb $2, %bh
+; CHECK-NOT: test
+; CHECK: {{^}} je
+ %and = and i32 %flags, 512
+ %tobool = icmp eq i32 %and, 0
+ br i1 %tobool, label %if.end, label %if.then
+
+if.then: ; preds = %entry
+ br label %if.end
+
+if.end: ; preds = %entry, %if.then
+ %hasflag = phi i32 [ 1, %if.then ], [ 0, %entry ]
+ ret i32 %hasflag
+}
+
diff --git a/test/CodeGen/X86/vbinop-simplify-bug.ll b/test/CodeGen/X86/vbinop-simplify-bug.ll
new file mode 100644
index 0000000..3a89cd7
--- /dev/null
+++ b/test/CodeGen/X86/vbinop-simplify-bug.ll
@@ -0,0 +1,23 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mattr=sse2 -mcpu=corei7 -o /dev/null
+
+; Revision 199135 introduced a wrong check in method
+; DAGCombiner::SimplifyVBinOp in an attempt to refactor some code
+; using the new method 'BuildVectorSDNode::isConstant' when possible.
+;
+; However, the modified code in SimplifyVBinOp now wrongly checks
+; that both operands of a vector bin-op are constants.
+;
+; With that wrong change, this test started failing because of a
+; 'fatal error in the backend':
+; Cannot select: 0x2e329d0: v4i32 = BUILD_VECTOR 0x2e2ea00, 0x2e2ea00, 0x2e2ea00, 0x2e2ea00
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+; 0x2e2ea00: i32 = Constant<1> [ID=4]
+
+define <8 x i32> @reduced_test_case() {
+ %Shuff = shufflevector <8 x i32> zeroinitializer, <8 x i32> zeroinitializer, <8 x i32> <i32 1, i32 3, i32 undef, i32 7, i32 9, i32 11, i32 13, i32 15>
+ %B23 = sub <8 x i32> %Shuff, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ ret <8 x i32> %B23
+}
+
diff --git a/test/CodeGen/X86/vec_round.ll b/test/CodeGen/X86/vec_round.ll
index baa2f58..9258f9e 100644
--- a/test/CodeGen/X86/vec_round.ll
+++ b/test/CodeGen/X86/vec_round.ll
@@ -5,7 +5,7 @@ target triple = "x86_64-unknown-linux-gnu"
declare void @use(<2 x double>)
; CHECK-LABEL: @test
-; CHECK callq round
+; CHECK: callq round
; Function Attrs: nounwind uwtable
define void @test() {
diff --git a/test/CodeGen/X86/vec_setcc-2.ll b/test/CodeGen/X86/vec_setcc-2.ll
new file mode 100644
index 0000000..ef916dc
--- /dev/null
+++ b/test/CodeGen/X86/vec_setcc-2.ll
@@ -0,0 +1,96 @@
+; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -o - -mcpu=generic -mtriple=x86_64-apple-darwin -mattr=+sse4.2 | FileCheck %s
+
+; For a setult against a constant, turn it into a setule and lower via psubusw.
+
+define void @loop_no_const_reload(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
+; CHECK: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-NEXT: .short 25
+; CHECK-LABEL: loop_no_const_reload:
+; CHECK: psubusw
+
+; The constant is no longer clobbered, so there is no need to reload it in the loop.
+
+; CHECK-NOT: movdqa {{%xmm[0-9]+}}, {{%xmm[0-9]+}}
+
+entry:
+ %cmp9 = icmp eq i32 %n, 0
+ br i1 %cmp9, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %in, i64 %indvars.iv
+ %arrayidx1.val = load <2 x i64>* %arrayidx1, align 16
+ %0 = bitcast <2 x i64> %arrayidx1.val to <8 x i16>
+ %cmp.i.i = icmp ult <8 x i16> %0, <i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26>
+ %sext.i.i = sext <8 x i1> %cmp.i.i to <8 x i16>
+ %1 = bitcast <8 x i16> %sext.i.i to <2 x i64>
+ %arrayidx5 = getelementptr inbounds <2 x i64>* %out, i64 %indvars.iv
+ store <2 x i64> %1, <2 x i64>* %arrayidx5, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Be careful if decrementing the constant would underflow.
+
+define void @loop_const_folding_underflow(<2 x i64>* %in, <2 x i64>* %out, i32 %n) {
+; CHECK-NOT: .short 25
+; CHECK-LABEL: loop_const_folding_underflow:
+; CHECK-NOT: psubusw
+entry:
+ %cmp9 = icmp eq i32 %n, 0
+ br i1 %cmp9, label %for.end, label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ]
+ %arrayidx1 = getelementptr inbounds <2 x i64>* %in, i64 %indvars.iv
+ %arrayidx1.val = load <2 x i64>* %arrayidx1, align 16
+ %0 = bitcast <2 x i64> %arrayidx1.val to <8 x i16>
+ %cmp.i.i = icmp ult <8 x i16> %0, <i16 0, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26, i16 26>
+ %sext.i.i = sext <8 x i1> %cmp.i.i to <8 x i16>
+ %1 = bitcast <8 x i16> %sext.i.i to <2 x i64>
+ %arrayidx5 = getelementptr inbounds <2 x i64>* %out, i64 %indvars.iv
+ store <2 x i64> %1, <2 x i64>* %arrayidx5, align 16
+ %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+ %exitcond = icmp eq i32 %lftr.wideiv, %n
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+; Test for PSUBUSB
+
+define <16 x i8> @test_ult_byte(<16 x i8> %a) {
+; CHECK: .space 16,10
+; CHECK-LABEL: test_ult_byte:
+; CHECK: psubus
+entry:
+ %icmp = icmp ult <16 x i8> %a, <i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11, i8 11>
+ %sext = sext <16 x i1> %icmp to <16 x i8>
+ ret <16 x i8> %sext
+}
+
+; Only do this when we can turn the comparison into a setule, i.e. not for
+; register operands.
+
+define <8 x i16> @test_ult_register(<8 x i16> %a, <8 x i16> %b) {
+; CHECK-LABEL: test_ult_register:
+; CHECK-NOT: psubus
+entry:
+ %icmp = icmp ult <8 x i16> %a, %b
+ %sext = sext <8 x i1> %icmp to <8 x i16>
+ ret <8 x i16> %sext
+}
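
The transform rests on saturating subtraction: for an unsigned constant C > 0, x <u C is x <=u C-1, and that holds exactly when subus(x, C-1) saturates to zero; hence the splat of 25 and the psubusw in the first test, and hence the transform must be skipped when computing C-1 would underflow (the second test, where one lane has C = 0 and x <u 0 is simply false). A scalar C check of the identity:

#include <assert.h>
#include <stdint.h>

/* Unsigned saturating subtract, the scalar model of psubusw. */
static uint16_t subus(uint16_t a, uint16_t b) { return a > b ? (uint16_t)(a - b) : 0; }

int main(void) {
    for (uint32_t x = 0; x <= 0xFFFF; ++x) {
        /* x <u 26  ==  subus(x, 25) == 0 */
        assert((x < 26) == (subus((uint16_t)x, 25) == 0));
    }
    return 0;
}
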
diff --git a/test/CodeGen/X86/vec_setcc.ll b/test/CodeGen/X86/vec_setcc.ll
index fc8a56d..322dbae 100644
--- a/test/CodeGen/X86/vec_setcc.ll
+++ b/test/CodeGen/X86/vec_setcc.ll
@@ -42,12 +42,9 @@ define <8 x i16> @v8i16_icmp_uge(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
%2 = sext <8 x i1> %1 to <8 x i16>
ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_uge:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SEE2: pxor %xmm2, %xmm0
-; SSE2: pxor %xmm1, %xmm2
-; SSE2: pcmpgtw %xmm0, %xmm2
-; SSE2: pcmpeqd %xmm0, %xmm0
-; SSE2: pxor %xmm2, %xmm0
+; SSE2: psubusw %xmm0, %xmm1
+; SEE2: pxor %xmm0, %xmm0
+; SSE2: pcmpeqw %xmm1, %xmm0
; SSE41-LABEL: v8i16_icmp_uge:
; SSE41: pmaxuw %xmm0, %xmm1
@@ -63,12 +60,9 @@ define <8 x i16> @v8i16_icmp_ule(<8 x i16> %a, <8 x i16> %b) nounwind readnone s
%2 = sext <8 x i1> %1 to <8 x i16>
ret <8 x i16> %2
; SSE2-LABEL: v8i16_icmp_ule:
-; SSE2: movdqa {{.*}}(%rip), %xmm2
-; SSE2: pxor %xmm2, %xmm1
-; SSE2: pxor %xmm2, %xmm0
-; SSE2: pcmpgtw %xmm1, %xmm0
-; SSE2: pcmpeqd %xmm1, %xmm1
-; SSE2: pxor %xmm0, %xmm1
+; SSE2: psubusw %xmm1, %xmm0
+; SSE2: pxor %xmm1, %xmm1
+; SSE2: pcmpeqw %xmm0, %xmm1
; SSE2: movdqa %xmm1, %xmm0
; SSE41-LABEL: v8i16_icmp_ule:
diff --git a/test/CodeGen/X86/vec_shift4.ll b/test/CodeGen/X86/vec_shift4.ll
index e2fe45c..b266a69 100644
--- a/test/CodeGen/X86/vec_shift4.ll
+++ b/test/CodeGen/X86/vec_shift4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse4.1 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=corei7 | FileCheck %s
define <2 x i64> @shl1(<4 x i32> %r, <4 x i32> %a) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/vec_shift5.ll b/test/CodeGen/X86/vec_shift5.ll
new file mode 100644
index 0000000..2e98003
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift5.ll
@@ -0,0 +1,160 @@
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we correctly fold target-specific packed vector shifts by
+; immediate count into a simple build_vector when the elements of the input
+; vector to the packed shift are all constants or undef.
+
+define <8 x i16> @test1() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16> <i16 1, i16 2, i16 4, i16 8, i16 1, i16 2, i16 4, i16 8>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test2() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test3() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 4, i16 8, i16 16, i16 32, i16 4, i16 8, i16 16, i16 32>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test4() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 1, i32 2, i32 4, i32 8>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test5() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test6() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 4, i32 8, i32 16, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test7() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 1, i64 2>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test8() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 8, i64 16>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test9() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test10() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test11() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test12() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: psra
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test13() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <8 x i16> @test14() {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16> <i16 15, i16 8, i16 undef, i16 undef, i16 31, i16 undef, i16 64, i16 128>, i32 3)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: psrl
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <4 x i32> @test15() {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32> <i32 undef, i32 8, i32 undef, i32 32>, i32 3)
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+define <2 x i64> @test16() {
+ %1 = tail call <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64> <i64 undef, i64 31>, i32 3)
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: psll
+; CHECK: movaps
+; CHECK-NEXT: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.pslli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrli.w(<8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.pslli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrli.d(<4 x i32>, i32)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+declare <2 x i64> @llvm.x86.sse2.pslli.q(<2 x i64>, i32)
+declare <2 x i64> @llvm.x86.sse2.psrli.q(<2 x i64>, i32)
+
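Each test above is a plain constant fold: shifting a constant operand element-wise yields another constant vector, so the shift instruction disappears and only a movaps of the folded constant remains, with undef lanes staying undef. A scalar C check of the first fold (illustrative only):

#include <assert.h>
#include <stdint.h>

int main(void) {
    /* test1: llvm.x86.sse2.pslli.w(<1,2,4,8,1,2,4,8>, 3) folds to a constant */
    uint16_t in[8]  = {1, 2, 4, 8, 1, 2, 4, 8};
    uint16_t out[8] = {8, 16, 32, 64, 8, 16, 32, 64};
    for (int i = 0; i < 8; ++i)
        assert((uint16_t)(in[i] << 3) == out[i]);
    return 0;
}
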
diff --git a/test/CodeGen/X86/vec_shift6.ll b/test/CodeGen/X86/vec_shift6.ll
new file mode 100644
index 0000000..df2d9cb
--- /dev/null
+++ b/test/CodeGen/X86/vec_shift6.ll
@@ -0,0 +1,134 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 | FileCheck %s -check-prefix=CHECK -check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX2ONLY
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=knl | FileCheck %s -check-prefix=CHECK -check-prefix=AVX2 -check-prefix=AVX512
+
+
+; Verify that we don't scalarize a packed vector shift left of 16-bit
+; signed integers when the amount is a constant build_vector.
+; Check that we produce an SSE2 packed integer multiply (pmullw) instead.
+
+define <8 x i16> @test1(<8 x i16> %a) {
+ %shl = shl <8 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+ ret <8 x i16> %shl
+}
+; CHECK-LABEL: test1
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+ %shl = shl <8 x i16> %a, <i16 0, i16 undef, i16 0, i16 0, i16 1, i16 undef, i16 -1, i16 1>
+ ret <8 x i16> %shl
+}
+; CHECK-LABEL: test2
+; CHECK: pmullw
+; CHECK-NEXT: ret
+
+
+; Verify that a vector shift left of 32-bit signed integers is simply expanded
+; into an SSE4.1 pmulld (instead of cvttps2dq + pmulld) when the vector of
+; shift counts is a constant build_vector.
+
+define <4 x i32> @test3(<4 x i32> %a) {
+ %shl = shl <4 x i32> %a, <i32 1, i32 -1, i32 2, i32 -3>
+ ret <4 x i32> %shl
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: cvttps2dq
+; SSE: pmulld
+; AVX2: vpsllvd
+; CHECK-NEXT: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+ %shl = shl <4 x i32> %a, <i32 0, i32 0, i32 1, i32 1>
+ ret <4 x i32> %shl
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: cvttps2dq
+; SSE: pmulld
+; AVX2: vpsllvd
+; CHECK-NEXT: ret
+
+
+; If we have AVX/SSE2 but not AVX2, verify that the following shift is split
+; into two pmullw instructions. With AVX2, the test case below would produce
+; a single vpmullw.
+
+define <16 x i16> @test5(<16 x i16> %a) {
+ %shl = shl <16 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+ ret <16 x i16> %shl
+}
+; CHECK-LABEL: test5
+; SSE: pmullw
+; SSE-NEXT: pmullw
+; AVX2: vpmullw
+; AVX2-NOT: vpmullw
+; CHECK: ret
+
+
+; If we have AVX/SSE4.1 but not AVX2, verify that the following shift is split
+; into two pmulld instructions. With AVX2, the test case below would produce
+; a single vpsllvd instead.
+
+define <8 x i32> @test6(<8 x i32> %a) {
+ %shl = shl <8 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
+ ret <8 x i32> %shl
+}
+; CHECK-LABEL: test6
+; SSE: pmulld
+; SSE-NEXT: pmulld
+; AVX2: vpsllvd
+; CHECK: ret
+
+
+; With AVX2 and AVX512, the test case below should produce a sequence of
+; two vpmullw instructions. With SSE2, we instead split the shift into four
+; parts and then convert each part into a pmullw.
+
+define <32 x i16> @test7(<32 x i16> %a) {
+ %shl = shl <32 x i16> %a, <i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11, i16 1, i16 1, i16 2, i16 3, i16 7, i16 0, i16 9, i16 11>
+ ret <32 x i16> %shl
+}
+; CHECK-LABEL: test7
+; SSE: pmullw
+; SSE-NEXT: pmullw
+; SSE-NEXT: pmullw
+; SSE-NEXT: pmullw
+; AVX2: vpmullw
+; AVX2-NEXT: vpmullw
+; CHECK: ret
+
+
+; Similar to test7; the difference is that with AVX512 support
+; we only produce a single vpsllvd/vpsllvq instead of a pair.
+
+define <16 x i32> @test8(<16 x i32> %a) {
+ %shl = shl <16 x i32> %a, <i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3, i32 1, i32 1, i32 2, i32 3>
+ ret <16 x i32> %shl
+}
+; CHECK-LABEL: test8
+; SSE: pmulld
+; SSE-NEXT: pmulld
+; SSE-NEXT: pmulld
+; SSE-NEXT: pmulld
+; AVX2ONLY: vpsllvd
+; AVX2ONLY-NEXT: vpsllvd
+; AVX512: vpsllvd
+; AVX512-NOT: vpsllvd
+; CHECK: ret
+
+
+; The shift from 'test9' gets scalarized if we don't have AVX2/AVX512f support.
+
+define <8 x i64> @test9(<8 x i64> %a) {
+ %shl = shl <8 x i64> %a, <i64 1, i64 1, i64 2, i64 3, i64 1, i64 1, i64 2, i64 3>
+ ret <8 x i64> %shl
+}
+; CHECK-LABEL: test9
+; AVX2ONLY: vpsllvq
+; AVX2ONLY-NEXT: vpsllvq
+; AVX512: vpsllvq
+; AVX512-NOT: vpsllvq
+; CHECK: ret
+
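The lowering these tests expect rewrites a left shift by a constant vector as a multiply by the matching powers of two, since x << c == x * (1 << c) per lane, letting pmullw/pmulld handle all lanes at once. A scalar C check of the mapping used by test1 (illustrative):

#include <assert.h>
#include <stdint.h>

int main(void) {
    /* test1's shift amounts and the pmullw multiplier vector they become */
    uint16_t amt[8] = {1, 1, 2, 3, 7, 0, 9, 11};
    uint16_t mul[8] = {2, 2, 4, 8, 128, 1, 512, 2048};
    uint16_t x = 3;
    for (int i = 0; i < 8; ++i) {
        assert(mul[i] == (uint16_t)(1u << amt[i]));
        assert((uint16_t)(x << amt[i]) == (uint16_t)(x * mul[i]));
    }
    return 0;
}
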
diff --git a/test/CodeGen/X86/vec_shuf-insert.ll b/test/CodeGen/X86/vec_shuf-insert.ll
new file mode 100644
index 0000000..2e1a1d6
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuf-insert.ll
@@ -0,0 +1,29 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux -mcpu=corei7-avx | FileCheck %s
+
+; These tests check that an insert_subvector which replaces one of the halves
+; of a concat_vectors is optimized into a single vinsertf128.
+
+
+declare <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float>, <4 x float>, i8)
+
+define <8 x float> @lower_half(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) {
+ %1 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %1, <4 x float> %v3, i8 0)
+ ret <8 x float> %2
+
+; CHECK-LABEL: lower_half
+; CHECK-NOT: vinsertf128
+; CHECK: vinsertf128 $1, %xmm1, %ymm2, %ymm0
+; CHECK-NEXT: ret
+}
+
+define <8 x float> @upper_half(<4 x float> %v1, <4 x float> %v2, <4 x float> %v3) {
+ %1 = shufflevector <4 x float> %v1, <4 x float> %v2, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %2 = tail call <8 x float> @llvm.x86.avx.vinsertf128.ps.256(<8 x float> %1, <4 x float> %v3, i8 1)
+ ret <8 x float> %2
+
+; CHECK-LABEL: upper_half
+; CHECK-NOT: vinsertf128
+; CHECK: vinsertf128 $1, %xmm2, %ymm0, %ymm0
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/vec_shuffle-40.ll b/test/CodeGen/X86/vec_shuffle-40.ll
new file mode 100644
index 0000000..75b45e3
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-40.ll
@@ -0,0 +1,22 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
+define void @shuffle_v16i16(<16 x i16>* %a) {
+; CHECK-LABEL: shuffle_v16i16:
+; CHECK: vpshufb {{.*}}%ymm
+; CHECK-NOT: vpshufb {{.*}}%xmm
+entry:
+ %0 = load <16 x i16>* %a, align 32
+ %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 5, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ store <16 x i16> %shuffle, <16 x i16>* %a, align 32
+ ret void
+}
+
+define void @shuffle_v16i16_lanecrossing(<16 x i16>* %a) {
+; CHECK-LABEL: shuffle_v16i16_lanecrossing:
+; CHECK-NOT: vpshufb {{.*}}%ymm
+entry:
+ %0 = load <16 x i16>* %a, align 32
+ %shuffle = shufflevector <16 x i16> %0, <16 x i16> undef, <16 x i32> <i32 1, i32 1, i32 3, i32 3, i32 5, i32 13, i32 7, i32 7, i32 9, i32 9, i32 11, i32 11, i32 13, i32 13, i32 15, i32 15>
+ store <16 x i16> %shuffle, <16 x i16>* %a, align 32
+ ret void
+}
diff --git a/test/CodeGen/X86/vector-gep.ll b/test/CodeGen/X86/vector-gep.ll
index b87d844..9c68f44 100644
--- a/test/CodeGen/X86/vector-gep.ll
+++ b/test/CodeGen/X86/vector-gep.ll
@@ -4,22 +4,26 @@
;CHECK-LABEL: AGEP0:
define <4 x i32*> @AGEP0(i32* %ptr) nounwind {
entry:
+;CHECK-LABEL: AGEP0
+;CHECK: vbroadcast
+;CHECK-NEXT: vpaddd
+;CHECK-NEXT: ret
%vecinit.i = insertelement <4 x i32*> undef, i32* %ptr, i32 0
%vecinit2.i = insertelement <4 x i32*> %vecinit.i, i32* %ptr, i32 1
%vecinit4.i = insertelement <4 x i32*> %vecinit2.i, i32* %ptr, i32 2
%vecinit6.i = insertelement <4 x i32*> %vecinit4.i, i32* %ptr, i32 3
-;CHECK: padd
%A2 = getelementptr <4 x i32*> %vecinit6.i, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
-;CHECK: padd
%A3 = getelementptr <4 x i32*> %A2, <4 x i32> <i32 10, i32 14, i32 19, i32 233>
ret <4 x i32*> %A3
-;CHECK: ret
}
;CHECK-LABEL: AGEP1:
define i32 @AGEP1(<4 x i32*> %param) nounwind {
entry:
-;CHECK: padd
+;CHECK-LABEL: AGEP1
+;CHECK: vpaddd
+;CHECK-NEXT: vpextrd
+;CHECK-NEXT: movl
%A2 = getelementptr <4 x i32*> %param, <4 x i32> <i32 1, i32 2, i32 3, i32 4>
%k = extractelement <4 x i32*> %A2, i32 3
%v = load i32* %k
@@ -30,8 +34,9 @@ entry:
;CHECK-LABEL: AGEP2:
define i32 @AGEP2(<4 x i32*> %param, <4 x i32> %off) nounwind {
entry:
-;CHECK: pslld $2
-;CHECK: padd
+;CHECK-LABEL: AGEP2
+;CHECK: vpslld $2
+;CHECK-NEXT: vpadd
%A2 = getelementptr <4 x i32*> %param, <4 x i32> %off
%k = extractelement <4 x i32*> %A2, i32 3
%v = load i32* %k
@@ -42,8 +47,9 @@ entry:
;CHECK-LABEL: AGEP3:
define <4 x i32*> @AGEP3(<4 x i32*> %param, <4 x i32> %off) nounwind {
entry:
-;CHECK: pslld $2
-;CHECK: padd
+;CHECK-LABEL: AGEP3
+;CHECK: vpslld $2
+;CHECK-NEXT: vpadd
%A2 = getelementptr <4 x i32*> %param, <4 x i32> %off
%v = alloca i32
%k = insertelement <4 x i32*> %A2, i32* %v, i32 3
@@ -54,10 +60,11 @@ entry:
;CHECK-LABEL: AGEP4:
define <4 x i16*> @AGEP4(<4 x i16*> %param, <4 x i32> %off) nounwind {
entry:
+;CHECK-LABEL: AGEP4
; Multiply offset by two (add it to itself).
-;CHECK: padd
+;CHECK: vpadd
; add the base to the offset
-;CHECK: padd
+;CHECK-NEXT: vpadd
%A = getelementptr <4 x i16*> %param, <4 x i32> %off
ret <4 x i16*> %A
;CHECK: ret
@@ -66,7 +73,8 @@ entry:
;CHECK-LABEL: AGEP5:
define <4 x i8*> @AGEP5(<4 x i8*> %param, <4 x i8> %off) nounwind {
entry:
-;CHECK: paddd
+;CHECK-LABEL: AGEP5
+;CHECK: vpaddd
%A = getelementptr <4 x i8*> %param, <4 x i8> %off
ret <4 x i8*> %A
;CHECK: ret
@@ -77,6 +85,7 @@ entry:
;CHECK-LABEL: AGEP6:
define <4 x i8*> @AGEP6(<4 x i8*> %param, <4 x i32> %off) nounwind {
entry:
+;CHECK-LABEL: AGEP6
;CHECK-NOT: pslld
%A = getelementptr <4 x i8*> %param, <4 x i32> %off
ret <4 x i8*> %A
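
The checks above spell out what a vector GEP lowers to: broadcast the scalar base if needed, scale each lane's index by the element size (a vpslld by log2(size), an add-to-itself for the 2-byte elements in AGEP4, and no scaling at all for i8 in AGEP6), then add. A scalar C model of the per-lane address math (illustrative):

#include <stdint.h>

/* Per-lane model of: getelementptr <4 x i32*> %base, <4 x i32> %off */
static void vector_gep_i32(int32_t *base[4], int32_t off[4], int32_t *out[4]) {
    for (int i = 0; i < 4; ++i)
        out[i] = (int32_t *)((char *)base[i] + (intptr_t)off[i] * sizeof(int32_t));
    /* the *4 scaling is the vpslld $2; the pointer add is the vpaddd */
}
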
diff --git a/test/CodeGen/X86/viabs.ll b/test/CodeGen/X86/viabs.ll
index 0be00da..d9f2cb0 100644
--- a/test/CodeGen/X86/viabs.ll
+++ b/test/CodeGen/X86/viabs.ll
@@ -1,6 +1,7 @@
; RUN: llc < %s -march=x86-64 -mcpu=x86-64 | FileCheck %s -check-prefix=SSE2
; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s -check-prefix=SSSE3
; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s -check-prefix=AVX2
+; RUN: llc < %s -march=x86-64 -mcpu=knl | FileCheck %s -check-prefix=AVX512
define <4 x i32> @test1(<4 x i32> %a) nounwind {
; SSE2-LABEL: test1:
@@ -17,6 +18,10 @@ define <4 x i32> @test1(<4 x i32> %a) nounwind {
; AVX2-LABEL: test1:
; AVX2: vpabsd
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test1:
+; AVX512: vpabsd
+; AVX512-NEXT: ret
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sgt <4 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1>
%abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -38,6 +43,10 @@ define <4 x i32> @test2(<4 x i32> %a) nounwind {
; AVX2-LABEL: test2:
; AVX2: vpabsd
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test2:
+; AVX512: vpabsd
+; AVX512-NEXT: ret
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sge <4 x i32> %a, zeroinitializer
%abs = select <4 x i1> %b, <4 x i32> %a, <4 x i32> %tmp1neg
@@ -59,6 +68,10 @@ define <8 x i16> @test3(<8 x i16> %a) nounwind {
; AVX2-LABEL: test3:
; AVX2: vpabsw
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test3:
+; AVX512: vpabsw
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i16> zeroinitializer, %a
%b = icmp sgt <8 x i16> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i16> %a, <8 x i16> %tmp1neg
@@ -80,6 +93,10 @@ define <16 x i8> @test4(<16 x i8> %a) nounwind {
; AVX2-LABEL: test4:
; AVX2: vpabsb
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test4:
+; AVX512: vpabsb
+; AVX512-NEXT: ret
%tmp1neg = sub <16 x i8> zeroinitializer, %a
%b = icmp slt <16 x i8> %a, zeroinitializer
%abs = select <16 x i1> %b, <16 x i8> %tmp1neg, <16 x i8> %a
@@ -101,6 +118,10 @@ define <4 x i32> @test5(<4 x i32> %a) nounwind {
; AVX2-LABEL: test5:
; AVX2: vpabsd
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test5:
+; AVX512: vpabsd
+; AVX512-NEXT: ret
%tmp1neg = sub <4 x i32> zeroinitializer, %a
%b = icmp sle <4 x i32> %a, zeroinitializer
%abs = select <4 x i1> %b, <4 x i32> %tmp1neg, <4 x i32> %a
@@ -116,6 +137,10 @@ define <8 x i32> @test6(<8 x i32> %a) nounwind {
; AVX2-LABEL: test6:
; AVX2: vpabsd {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test6:
+; AVX512: vpabsd {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sgt <8 x i32> %a, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
%abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
@@ -131,6 +156,10 @@ define <8 x i32> @test7(<8 x i32> %a) nounwind {
; AVX2-LABEL: test7:
; AVX2: vpabsd {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test7:
+; AVX512: vpabsd {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sge <8 x i32> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i32> %a, <8 x i32> %tmp1neg
@@ -146,6 +175,10 @@ define <16 x i16> @test8(<16 x i16> %a) nounwind {
; AVX2-LABEL: test8:
; AVX2: vpabsw {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test8:
+; AVX512: vpabsw {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <16 x i16> zeroinitializer, %a
%b = icmp sgt <16 x i16> %a, zeroinitializer
%abs = select <16 x i1> %b, <16 x i16> %a, <16 x i16> %tmp1neg
@@ -161,6 +194,10 @@ define <32 x i8> @test9(<32 x i8> %a) nounwind {
; AVX2-LABEL: test9:
; AVX2: vpabsb {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test9:
+; AVX512: vpabsb {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <32 x i8> zeroinitializer, %a
%b = icmp slt <32 x i8> %a, zeroinitializer
%abs = select <32 x i1> %b, <32 x i8> %tmp1neg, <32 x i8> %a
@@ -176,8 +213,58 @@ define <8 x i32> @test10(<8 x i32> %a) nounwind {
; AVX2-LABEL: test10:
; AVX2: vpabsd {{.*}}%ymm
; AVX2-NEXT: ret
+
+; AVX512-LABEL: test10:
+; AVX512: vpabsd {{.*}}%ymm
+; AVX512-NEXT: ret
%tmp1neg = sub <8 x i32> zeroinitializer, %a
%b = icmp sle <8 x i32> %a, zeroinitializer
%abs = select <8 x i1> %b, <8 x i32> %tmp1neg, <8 x i32> %a
ret <8 x i32> %abs
}
+
+define <16 x i32> @test11(<16 x i32> %a) nounwind {
+; AVX2-LABEL: test11:
+; AVX2: vpabsd
+; AVX2: vpabsd
+; AVX2-NEXT: ret
+
+; AVX512-LABEL: test11:
+; AVX512: vpabsd {{.*}}%zmm
+; AVX512-NEXT: ret
+ %tmp1neg = sub <16 x i32> zeroinitializer, %a
+ %b = icmp sle <16 x i32> %a, zeroinitializer
+ %abs = select <16 x i1> %b, <16 x i32> %tmp1neg, <16 x i32> %a
+ ret <16 x i32> %abs
+}
+
+define <8 x i64> @test12(<8 x i64> %a) nounwind {
+; AVX2-LABEL: test12:
+; AVX2: vpxor
+; AVX2: vpxor
+; AVX2-NEXT: ret
+
+; AVX512-LABEL: test12:
+; AVX512: vpabsq {{.*}}%zmm
+; AVX512-NEXT: ret
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
+}
+
+define <8 x i64> @test13(<8 x i64>* %a.ptr) nounwind {
+; AVX2-LABEL: test13:
+; AVX2: vpxor
+; AVX2: vpxor
+; AVX2-NEXT: ret
+
+; AVX512-LABEL: test13:
+; AVX512: vpabsq (%
+; AVX512-NEXT: ret
+ %a = load <8 x i64>* %a.ptr, align 8
+ %tmp1neg = sub <8 x i64> zeroinitializer, %a
+ %b = icmp sle <8 x i64> %a, zeroinitializer
+ %abs = select <8 x i1> %b, <8 x i64> %tmp1neg, <8 x i64> %a
+ ret <8 x i64> %abs
+}
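+
+; For reference, every function above is the vector form of the scalar
+; absolute-value idiom below; the backend matches the sub/icmp/select
+; triple and lowers it to a single vpabs* instruction. AVX2 has no packed
+; 64-bit abs, which is why test12/test13 fall back to a vpxor-based
+; expansion there while AVX512 uses vpabsq directly. A C sketch of the
+; idiom (element type varies per test):
+;
+;   int iabs(int a) {
+;     return a < 0 ? -a : a;
+;   }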
diff --git a/test/CodeGen/X86/vselect-2.ll b/test/CodeGen/X86/vselect-2.ll
new file mode 100644
index 0000000..50da32c
--- /dev/null
+++ b/test/CodeGen/X86/vselect-2.ll
@@ -0,0 +1,33 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7 -mattr=sse2 | FileCheck %s
+
+define <4 x i32> @test1(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test1
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x i32> @test2(<4 x i32> %A, <4 x i32> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x i32> %A, <4 x i32> %B
+ ret <4 x i32> %select
+}
+; CHECK-LABEL: test2
+; CHECK: movsd
+; CHECK-NEXT: ret
+
+define <4 x float> @test3(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 true, i1 true, i1 false, i1 false>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test3
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x float> @test4(<4 x float> %A, <4 x float> %B) {
+ %select = select <4 x i1><i1 false, i1 false, i1 true, i1 true>, <4 x float> %A, <4 x float> %B
+ ret <4 x float> %select
+}
+; CHECK-LABEL: test4
+; CHECK: movsd
+; CHECK-NEXT: ret
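+
+; Note: in all four tests the constant mask selects one 64-bit half from
+; each input, which a single movsd (move the low quadword, keep the high
+; quadword) expresses directly, so no blend sequence is required.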
diff --git a/test/CodeGen/X86/vselect.ll b/test/CodeGen/X86/vselect.ll
new file mode 100644
index 0000000..0cf03fc
--- /dev/null
+++ b/test/CodeGen/X86/vselect.ll
@@ -0,0 +1,264 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mcpu=corei7 -mattr=-sse4.1 < %s | FileCheck %s
+
+; Verify that we don't emit packed vector shift instructions if the
+; condition used by the vector select is a vector of constants.
+
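+; With a compile-time-constant mask the select reduces to a fixed blend or
+; shuffle. As a rough C model of the semantics (a sketch only; the helper
+; name is made up and this is not the lowering itself):
+;
+;   void vselect4(float r[4], const _Bool m[4],
+;                 const float a[4], const float b[4]) {
+;     for (int i = 0; i < 4; ++i)
+;       r[i] = m[i] ? a[i] : b[i];  /* m[] is a constant in these tests */
+;   }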
+
+define <4 x float> @test1(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test1
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test2(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test3(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <4 x float> @test4(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test4
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK: ret
+
+
+define <4 x float> @test5(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test5
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test6(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 false, i1 true, i1 false, i1 true, i1 false, i1 true, i1 false>, <8 x i16> %a, <8 x i16> %a
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test6
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test7(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test7
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+define <8 x i16> @test8(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test8
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test9(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false, i1 false>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test9
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test10(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true, i1 true>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test10
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test11(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 true, i1 true, i1 false, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test11
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test12(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 undef, i1 false, i1 false, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test12
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+define <8 x i16> @test13(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test13
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK: ret
+
+; Fold (vselect (build_vector AllOnes), N1, N2) -> N1
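+; (Undef mask lanes may be treated as true here, so a mask of true/undef
+; lanes folds to all-ones and the select collapses to N1 without even
+; materializing a constant; hence the CHECK-NOT: pcmpeq below.)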
+
+define <4 x float> @test14(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 undef, i1 true, i1 undef>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test14
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: pcmpeq
+; CHECK: ret
+
+define <8 x i16> @test15(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 true, i1 true, i1 true, i1 undef, i1 undef, i1 true, i1 true, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test15
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: pcmpeq
+; CHECK: ret
+
+; Fold (vselect (build_vector AllZeros), N1, N2) -> N2
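+; (Symmetrically, undef lanes may be treated as false, so the mask folds
+; to all-zeros and the select collapses to N2 without materializing a zero
+; vector; hence the CHECK-NOT: xorps below.)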
+
+define <4 x float> @test16(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 undef, i1 false, i1 undef>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test16
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: ret
+
+define <8 x i16> @test17(<8 x i16> %a, <8 x i16> %b) {
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 undef, i1 undef, i1 false, i1 false, i1 undef>, <8 x i16> %a, <8 x i16> %b
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test17
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: ret
+
+define <4 x float> @test18(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test18
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <4 x i32> @test19(<4 x i32> %a, <4 x i32> %b) {
+ %1 = select <4 x i1> <i1 false, i1 true, i1 true, i1 true>, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test19
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <2 x double> @test20(<2 x double> %a, <2 x double> %b) {
+ %1 = select <2 x i1> <i1 false, i1 true>, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test20
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <2 x i64> @test21(<2 x i64> %a, <2 x i64> %b) {
+ %1 = select <2 x i1> <i1 false, i1 true>, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test21
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <4 x float> @test22(<4 x float> %a, <4 x float> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x float> %a, <4 x float> %b
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test22
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <4 x i32> @test23(<4 x i32> %a, <4 x i32> %b) {
+ %1 = select <4 x i1> <i1 true, i1 false, i1 false, i1 false>, <4 x i32> %a, <4 x i32> %b
+ ret <4 x i32> %1
+}
+; CHECK-LABEL: test23
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movss
+; CHECK: ret
+
+define <2 x double> @test24(<2 x double> %a, <2 x double> %b) {
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %a, <2 x double> %b
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test24
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
+define <2 x i64> @test25(<2 x i64> %a, <2 x i64> %b) {
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x i64> %a, <2 x i64> %b
+ ret <2 x i64> %1
+}
+; CHECK-LABEL: test25
+; CHECK-NOT: psllw
+; CHECK-NOT: psraw
+; CHECK-NOT: xorps
+; CHECK: movsd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/vshift-4.ll b/test/CodeGen/X86/vshift-4.ll
index 4363cd9..a060cf8 100644
--- a/test/CodeGen/X86/vshift-4.ll
+++ b/test/CodeGen/X86/vshift-4.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86 -mattr=+sse2 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
; Test that vector shifts are converted to proper SSE2 vector shifts when
; the shift amounts are made equal via a shuffle splat.
diff --git a/test/CodeGen/X86/vshift-6.ll b/test/CodeGen/X86/vshift-6.ll
new file mode 100644
index 0000000..f50d9a6
--- /dev/null
+++ b/test/CodeGen/X86/vshift-6.ll
@@ -0,0 +1,36 @@
+; RUN: llc < %s -mcpu=corei7 -march=x86-64 -mattr=+sse2 | FileCheck %s
+
+; This test makes sure that the compiler does not crash with an
+; assertion failure when trying to fold a vector shift left
+; by immediate count when the type of the input vector differs
+; from the result type.
+;
+; This happens, for example, when lowering a shift left of an MVT::v16i8
+; vector, which is custom lowered into the following sequence:
+;     count = count << 5;
+;     A = VSHLI(MVT::v8i16, r & (char16)15, 4)
+;     B = BITCAST MVT::v16i8, A
+;     r = VSELECT(r, B, count);
+;     count += count;
+;     C = VSHLI(MVT::v8i16, r & (char16)63, 2)
+;     D = BITCAST MVT::v16i8, C
+;     r = VSELECT(r, D, count);
+;     count += count;
+;     r = VSELECT(r, r+r, count);
+;
+; Where 'r' is a vector of type MVT::v16i8, and
+; 'count' is the vector shift count.
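+;
+; In C terms, the construct that triggers this path is just a per-element
+; variable shift of 8-bit lanes (a sketch; the function name is only
+; illustrative):
+;
+;   void shl_v16i8(unsigned char r[16], const unsigned char count[16]) {
+;     for (int i = 0; i < 16; ++i)
+;       r[i] = (unsigned char)(r[i] << count[i]);
+;   }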
+
+define <16 x i8> @do_not_crash(i8*, i32*, i64*, i32, i64, i8) {
+entry:
+ store i8 %5, i8* %0
+ %L5 = load i8* %0
+ %I8 = insertelement <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, i8 %L5, i32 7
+ %B51 = shl <16 x i8> <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>, %I8
+ ret <16 x i8> %B51
+}
+
+; CHECK-LABEL: do_not_crash
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/warn-stack.ll b/test/CodeGen/X86/warn-stack.ll
index 5979f45..a76fd28 100644
--- a/test/CodeGen/X86/warn-stack.ll
+++ b/test/CodeGen/X86/warn-stack.ll
@@ -12,7 +12,7 @@ entry:
ret void
}
-; CHECK: warning: Stack size limit exceeded (104) in warn.
+; CHECK: warning: stack size limit exceeded (104) in warn
define void @warn() nounwind ssp {
entry:
%buffer = alloca [80 x i8], align 1
diff --git a/test/CodeGen/X86/weak_def_can_be_hidden.ll b/test/CodeGen/X86/weak_def_can_be_hidden.ll
index f78f357..b17f372 100644
--- a/test/CodeGen/X86/weak_def_can_be_hidden.ll
+++ b/test/CodeGen/X86/weak_def_can_be_hidden.ll
@@ -1,26 +1,51 @@
-; RUN: llc -mtriple=x86_64-apple-darwin -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin11 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin10 -O0 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
+; RUN: llc -mtriple=i686-apple-darwin9 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
+; RUN: llc -mtriple=i686-apple-darwin8 -O0 < %s | FileCheck --check-prefix=CHECK-D89 %s
-@v1 = linkonce_odr global i32 32
+@v1 = linkonce_odr constant i32 32
; CHECK: .globl _v1
; CHECK: .weak_def_can_be_hidden _v1
+; CHECK-D89: .globl _v1
+; CHECK-D89: .weak_definition _v1
+
define i32 @f1() {
%x = load i32 * @v1
ret i32 %x
}
-@v2 = linkonce_odr global i32 32
+@v2 = linkonce_odr constant i32 32
; CHECK: .globl _v2
; CHECK: .weak_definition _v2
-@v3 = linkonce_odr unnamed_addr global i32 32
-; CHECK: .globl _v3
-; CHECK: .weak_def_can_be_hidden _v3
+; CHECK-D89: .globl _v2
+; CHECK-D89: .weak_definition _v2
define i32* @f2() {
ret i32* @v2
}
+@v3 = linkonce_odr unnamed_addr global i32 32
+; CHECK: .globl _v3
+; CHECK: .weak_def_can_be_hidden _v3
+
+; CHECK-D89: .globl _v3
+; CHECK-D89: .weak_definition _v3
+
define i32* @f3() {
ret i32* @v3
}
+
+@v4 = linkonce_odr global i32 32
+; CHECK: .globl _v4
+; CHECK: .weak_definition _v4
+
+; CHECK-D89: .globl _v4
+; CHECK-D89: .weak_definition _v4
+
+define i32 @f4() {
+ %x = load i32 * @v4
+ ret i32 %x
+}
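+
+; Summary of the rule exercised above: .weak_def_can_be_hidden is emitted
+; only when the symbol's address is provably insignificant, i.e. a constant
+; whose address does not escape (v1) or an unnamed_addr global (v3). When
+; the address escapes (v2) or the global is writable (v4), the plain
+; .weak_definition is kept, and Darwin 8/9 never use the newer directive.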
diff --git a/test/CodeGen/X86/widen_load-2.ll b/test/CodeGen/X86/widen_load-2.ll
index 26815a4..41bea85 100644
--- a/test/CodeGen/X86/widen_load-2.ll
+++ b/test/CodeGen/X86/widen_load-2.ll
@@ -149,9 +149,9 @@ define void @add31i8(%i8vec31* nocapture sret %ret, %i8vec31* %ap, %i8vec31* %bp
; CHECK: movdqa
; CHECK: paddb
; CHECK: paddb
-; CHECK: movq
; CHECK: pextrb
; CHECK: pextrw
+; CHECK: movq
; CHECK: ret
%a = load %i8vec31* %ap, align 16
%b = load %i8vec31* %bp, align 16
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index a24963a..d8ecd44 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -1,11 +1,11 @@
; We specify -mcpu explicitly to keep instruction reordering that happens on
; some setups (e.g., Atom) from affecting the output.
; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
-; RUN: llc < %s -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
-; RUN: llc < %s -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
-; RUN: llc < %s -O0 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
-; RUN: llc < %s -O0 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
+; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; The SysV ABI used by most Unixes and Mingw on x86 specifies that an sret pointer
; is callee-cleanup. However, in MSVC's cdecl calling convention, sret pointer
@@ -13,16 +13,16 @@
define void @sret1(i8* sret %x) nounwind {
entry:
-; WIN32: sret1
+; WIN32-LABEL: _sret1:
; WIN32: movb $42, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret1
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret1:
+; MINGW_X86: {{retl$}}
-; LINUX: sret1
-; LINUX: ret $4
+; LINUX-LABEL: sret1:
+; LINUX: retl $4
store i8 42, i8* %x, align 4
ret void
@@ -30,16 +30,16 @@ entry:
define void @sret2(i8* sret %x, i8 %y) nounwind {
entry:
-; WIN32: sret2
+; WIN32-LABEL: _sret2:
; WIN32: movb {{.*}}, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret2
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret2:
+; MINGW_X86: {{retl$}}
-; LINUX: sret2
-; LINUX: ret $4
+; LINUX-LABEL: sret2:
+; LINUX: retl $4
store i8 %y, i8* %x
ret void
@@ -47,17 +47,17 @@ entry:
define void @sret3(i8* sret %x, i8* %y) nounwind {
entry:
-; WIN32: sret3
+; WIN32-LABEL: _sret3:
; WIN32: movb $42, (%eax)
; WIN32-NOT: movb $13, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret3
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret3:
+; MINGW_X86: {{retl$}}
-; LINUX: sret3
-; LINUX: ret $4
+; LINUX-LABEL: sret3:
+; LINUX: retl $4
store i8 42, i8* %x
store i8 13, i8* %y
@@ -69,16 +69,16 @@ entry:
define void @sret4(%struct.S4* noalias sret %agg.result) {
entry:
-; WIN32: sret4
+; WIN32-LABEL: _sret4:
; WIN32: movl $42, (%eax)
; WIN32-NOT: popl %eax
-; WIN32: {{ret$}}
+; WIN32: {{retl$}}
-; MINGW_X86: sret4
-; MINGW_X86: ret $4
+; MINGW_X86-LABEL: _sret4:
+; MINGW_X86: {{retl$}}
-; LINUX: sret4
-; LINUX: ret $4
+; LINUX-LABEL: sret4:
+; LINUX: retl $4
%x = getelementptr inbounds %struct.S4* %agg.result, i32 0, i32 0
store i32 42, i32* %x, align 4
@@ -96,14 +96,16 @@ entry:
%x = getelementptr inbounds %struct.S5* %agg.result, i32 0, i32 0
store i32 42, i32* %x, align 4
ret void
-; WIN32: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; WIN32-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; MINGW_X86-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; LINUX-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; The address of the return structure is passed as an implicit parameter.
; In the -O0 build, %eax is spilled at the beginning of the function, hence we
; should match both 4(%esp) and 8(%esp).
; WIN32: {{[48]}}(%esp), %eax
; WIN32: movl $42, (%eax)
-; WIN32: ret $4
+; WIN32: retl $4
}
define void @call_foo5() {
@@ -111,7 +113,10 @@ entry:
%c = alloca %class.C5, align 1
%s = alloca %struct.S5, align 4
call x86_thiscallcc void @"\01?foo@C5@@QAE?AUS5@@XZ"(%struct.S5* sret %s, %class.C5* %c)
-; WIN32: {{^}}_call_foo5:
+; WIN32-LABEL: {{^}}_call_foo5:
+; MINGW_X86-LABEL: {{^}}_call_foo5:
+; LINUX-LABEL: {{^}}call_foo5:
+
; Load the address of the result and put it onto the stack
; (through %ecx in the -O0 build).
@@ -121,6 +126,35 @@ entry:
; The this pointer goes to ECX.
; WIN32-NEXT: leal {{[0-9]+}}(%esp), %ecx
; WIN32-NEXT: calll "?foo@C5@@QAE?AUS5@@XZ"
-; WIN32: ret
+; WIN32: retl
+ ret void
+}
+
+
+%struct.test6 = type { i32, i32, i32 }
+define void @test6_f(%struct.test6* %x) nounwind {
+; WIN32-LABEL: _test6_f:
+; MINGW_X86-LABEL: _test6_f:
+; LINUX-LABEL: test6_f:
+
+; The %x argument is moved to %ecx. It will be the this pointer.
+; WIN32: movl 8(%ebp), %ecx
+
+; The %x argument is moved to (%esp). It will be the this pointer. With -O0
+; we copy %esp to %ecx and use (%ecx) instead of (%esp).
+; MINGW_X86: movl 8(%ebp), %eax
+; MINGW_X86: movl %eax, (%e{{([a-d]x)|(sp)}})
+
+; The sret pointer is (%esp)
+; WIN32: leal 8(%esp), %[[REG:e[a-d]x]]
+; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+
+; The sret pointer is %ecx
+; MINGW_X86-NEXT: leal 8(%esp), %ecx
+; MINGW_X86-NEXT: calll _test6_g
+
+ %tmp = alloca %struct.test6, align 4
+ call x86_thiscallcc void @test6_g(%struct.test6* sret %tmp, %struct.test6* %x)
ret void
}
+declare x86_thiscallcc void @test6_g(%struct.test6* sret, %struct.test6*)
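+
+; Note how the two ABIs disagree about which implicit pointer travels in
+; %ecx for a thiscall function returning via sret: MSVC keeps %ecx for the
+; this pointer and passes the sret pointer on the stack, while MinGW puts
+; the sret pointer in %ecx and the this pointer on the stack.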
diff --git a/test/CodeGen/X86/win64_alloca_dynalloca.ll b/test/CodeGen/X86/win64_alloca_dynalloca.ll
index aff5305..a6b6536 100644
--- a/test/CodeGen/X86/win64_alloca_dynalloca.ll
+++ b/test/CodeGen/X86/win64_alloca_dynalloca.ll
@@ -12,11 +12,11 @@ entry:
%buf0 = alloca i8, i64 4096, align 1
-; ___chkstk must adjust %rsp.
+; ___chkstk_ms does not adjust %rsp.
; M64: movq %rsp, %rbp
; M64: $4096, %rax
-; M64: callq ___chkstk
-; M64-NOT: %rsp
+; M64: callq ___chkstk_ms
+; M64: subq %rax, %rsp
; __chkstk does not adjust %rsp.
; W64: movq %rsp, %rbp
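
; The probe protocol exercised above: the allocation size is passed in
; %rax, ___chkstk_ms only probes the guard pages, and the caller then
; performs the actual adjustment itself (the subq %rax, %rsp that follows
; the call).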
diff --git a/test/CodeGen/X86/win_chkstk.ll b/test/CodeGen/X86/win_chkstk.ll
index 3f522ea..0c02c1a 100644
--- a/test/CodeGen/X86/win_chkstk.ll
+++ b/test/CodeGen/X86/win_chkstk.ll
@@ -17,7 +17,7 @@ entry:
; WIN_X32: calll __chkstk
; WIN_X64: callq __chkstk
; MINGW_X32: calll __alloca
-; MINGW_X64: callq ___chkstk
+; MINGW_X64: callq ___chkstk_ms
; LINUX-NOT: call __chkstk
%array4096 = alloca [4096 x i8], align 16 ; <[4096 x i8]*> [#uses=0]
ret i32 0
@@ -36,7 +36,7 @@ entry:
; WIN_X64: ret
; MINGW_X64: # BB#0:
-; MINGW_X64-NOT: callq _alloca
+; MINGW_X64-NOT: callq ___chkstk_ms
; MINGW_X64: ret
; LINUX: # BB#0:
@@ -53,7 +53,7 @@ entry:
; WIN_X32: calll __chkstk
; WIN_X64: callq __chkstk
; MINGW_X32: calll __alloca
-; MINGW_X64: callq ___chkstk
+; MINGW_X64: callq ___chkstk_ms
; LINUX-NOT: call __chkstk
%array4096 = alloca [4096 x i8], align 16 ; <[4096 x i8]*> [#uses=0]
ret i32 0
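
; Background for the checks above: frames larger than a guard page (4 KiB)
; must be probed page by page. __chkstk is the MSVC helper, __alloca the
; 32-bit MinGW one, and ___chkstk_ms is MinGW-w64's MSVC-compatible probe.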
diff --git a/test/CodeGen/X86/x86-64-double-precision-shift-left.ll b/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
new file mode 100644
index 0000000..f2380f2
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-precision-shift-left.ll
@@ -0,0 +1,77 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for the architectures that are known to have poor latency
+; double precision shift instructions we generate alternative sequence
+; of instructions with lower latencies instead of shld instruction.
+
+;uint64_t lshift1(uint64_t a, uint64_t b)
+;{
+; return (a << 1) | (b >> 63);
+;}
+
+; CHECK: lshift1:
+; CHECK: addq {{.*}},{{.*}}
+; CHECK-NEXT: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @lshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 1
+ %shr = lshr i64 %b, 63
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
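+
+; Note: for the one-bit case the addq above is the shift itself
+; (a + a == a << 1), so no separate shlq is needed for the %a term.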
+
+;uint64_t lshift2(uint64_t a, uint64_t b)
+;{
+; return (a << 2) | (b >> 62);
+;}
+
+; CHECK: lshift2:
+; CHECK: shlq $2, {{.*}}
+; CHECK-NEXT: shrq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 2
+ %shr = lshr i64 %b, 62
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift7(uint64_t a, uint64_t b)
+;{
+; return (a << 7) | (b >> 57);
+;}
+
+; CHECK: lshift7:
+; CHECK: shlq $7, {{.*}}
+; CHECK-NEXT: shrq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 7
+ %shr = lshr i64 %b, 57
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t lshift63(uint64_t a, uint64_t b)
+;{
+; return (a << 63) | (b >> 1);
+;}
+
+; CHECK: lshift63:
+; CHECK: shlq $63, {{.*}}
+; CHECK-NEXT: shrq {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @lshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+entry:
+ %shl = shl i64 %a, 63
+ %shr = lshr i64 %b, 1
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
diff --git a/test/CodeGen/X86/x86-64-double-precision-shift-right.ll b/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
new file mode 100644
index 0000000..5edaad8
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-precision-shift-right.ll
@@ -0,0 +1,74 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; Verify that for architectures that are known to have poor-latency
+; double-precision shift instructions we generate an alternative sequence
+; of lower-latency instructions instead of the shrd instruction.
+
+;uint64_t rshift1(uint64_t a, uint64_t b)
+;{
+; return (a >> 1) | (b << 63);
+;}
+
+; CHECK: rshift1:
+; CHECK: shrq {{.*}}
+; CHECK-NEXT: shlq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+define i64 @rshift1(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 1
+ %2 = shl i64 %b, 63
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift2(uint64_t a, uint64_t b)
+;{
+; return (a >> 2) | (b << 62);
+;}
+
+; CHECK: rshift2:
+; CHECK: shrq $2, {{.*}}
+; CHECK-NEXT: shlq $62, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift2(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 2
+ %2 = shl i64 %b, 62
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift7(uint64_t a, uint64_t b)
+;{
+; return (a >> 7) | (b << 57);
+;}
+
+; CHECK: rshift7:
+; CHECK: shrq $7, {{.*}}
+; CHECK-NEXT: shlq $57, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+
+
+define i64 @rshift7(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 7
+ %2 = shl i64 %b, 57
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
+
+;uint64_t rshift63(uint64_t a, uint64_t b)
+;{
+; return (a >> 63) | (b << 1);
+;}
+
+; CHECK: rshift63:
+; CHECK: shrq $63, {{.*}}
+; CHECK-NEXT: leaq ({{.*}},{{.*}}), {{.*}}
+; CHECK-NEXT: orq {{.*}}, {{.*}}
+
+define i64 @rshift63(i64 %a, i64 %b) nounwind readnone uwtable {
+ %1 = lshr i64 %a, 63
+ %2 = shl i64 %b, 1
+ %3 = or i64 %2, %1
+ ret i64 %3
+}
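+
+; Note: in the one-bit case the (b << 1) term is itself produced by the
+; leaq (b + b), so a final orq combines the two halves instead of another
+; lea-based merge.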
diff --git a/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
new file mode 100644
index 0000000..5d7a10b
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-shifts-Oz-Os-O2.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+
+; clang -Oz -c test1.cpp -emit-llvm -S -o
+; Verify that we generate the shld instruction when we are optimizing for
+; size, even for X86_64 processors that are known to have poor-latency
+; double-precision shift instructions.
+; uint64_t lshift10(uint64_t a, uint64_t b)
+; {
+; return (a << 10) | (b >> 54);
+; }
+
+; Function Attrs: minsize nounwind optsize readnone uwtable
+define i64 @_Z8lshift10mm(i64 %a, i64 %b) #0 {
+entry:
+; CHECK: shldq $10
+ %shl = shl i64 %a, 10
+ %shr = lshr i64 %b, 54
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #0 = { minsize nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+
+; clang -Os -c test2.cpp -emit-llvm -S
+; Verify that we generate the shld instruction when we are optimizing for
+; size, even for X86_64 processors that are known to have poor-latency
+; double-precision shift instructions.
+; uint64_t lshift11(uint64_t a, uint64_t b)
+; {
+; return (a << 11) | (b >> 53);
+; }
+
+; Function Attrs: nounwind optsize readnone uwtable
+define i64 @_Z8lshift11mm(i64 %a, i64 %b) #1 {
+entry:
+; CHECK: shldq $11
+ %shl = shl i64 %a, 11
+ %shr = lshr i64 %b, 53
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #1 = { nounwind optsize readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
+; clang -O2 -c test2.cpp -emit-llvm -S
+; Verify that we do not generate the shld instruction when we are not
+; optimizing for size, for X86_64 processors that are known to have
+; poor-latency double-precision shift instructions.
+; uint64_t lshift12(uint64_t a, uint64_t b)
+; {
+; return (a << 12) | (b >> 52);
+; }
+
+; Function Attrs: nounwind readnone uwtable
+define i64 @_Z8lshift12mm(i64 %a, i64 %b) #2 {
+entry:
+; CHECK: shlq $12
+; CHECK-NEXT: shrq $52
+ %shl = shl i64 %a, 12
+ %shr = lshr i64 %b, 52
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+attributes #2 = { nounwind readnone uwtable "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-nans-fp-math"="false" "stack-protector-buffer-size"="8" "unsafe-fp-math"="false" "use-soft-float"="false" }
+
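+; The attribute sets drive the difference above: #0 carries minsize (-Oz)
+; and #1 carries optsize (-Os), so both functions may use shldq despite
+; bdver1's slow double shifts; #2 has neither attribute, so the shlq/shrq
+; expansion is generated instead.
+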
diff --git a/test/CodeGen/X86/x86-64-double-shifts-var.ll b/test/CodeGen/X86/x86-64-double-shifts-var.ll
new file mode 100644
index 0000000..5bab434
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-double-shifts-var.ll
@@ -0,0 +1,57 @@
+; RUN: llc < %s -march=x86-64 -mcpu=athlon | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-tbird | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-4 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-xp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-mp | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon-fx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=k8-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=opteron-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=athlon64-sse3 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=amdfam10 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=btver2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver1 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=bdver2 | FileCheck %s
+
+; Verify that for X86_64 processors that are known to have poor-latency
+; double-precision shift instructions we do not generate 'shld' or 'shrd'
+; instructions.
+
+;uint64_t lshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a << c) | (b >> (64-c));
+;}
+
+define i64 @lshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shld
+ %sh_prom = zext i32 %c to i64
+ %shl = shl i64 %a, %sh_prom
+ %sub = sub nsw i32 64, %c
+ %sh_prom1 = zext i32 %sub to i64
+ %shr = lshr i64 %b, %sh_prom1
+ %or = or i64 %shr, %shl
+ ret i64 %or
+}
+
+;uint64_t rshift(uint64_t a, uint64_t b, int c)
+;{
+; return (a >> c) | (b << (64-c));
+;}
+
+define i64 @rshift(i64 %a, i64 %b, i32 %c) nounwind readnone {
+entry:
+; CHECK-NOT: shrd
+ %sh_prom = zext i32 %c to i64
+ %shr = lshr i64 %a, %sh_prom
+ %sub = sub nsw i32 64, %c
+ %sh_prom1 = zext i32 %sub to i64
+ %shl = shl i64 %b, %sh_prom1
+ %or = or i64 %shl, %shr
+ ret i64 %or
+}
+
+
diff --git a/test/CodeGen/X86/x86-shifts.ll b/test/CodeGen/X86/x86-shifts.ll
index 2f3adb8..ec47933 100644
--- a/test/CodeGen/X86/x86-shifts.ll
+++ b/test/CodeGen/X86/x86-shifts.ll
@@ -100,7 +100,7 @@ entry:
ret <8 x i16> %K
}
-; non splat test
+; non-splat test
define <8 x i16> @sll8_nosplat(<8 x i16> %A) nounwind {
diff --git a/test/CodeGen/X86/zlib-longest-match.ll b/test/CodeGen/X86/zlib-longest-match.ll
new file mode 100644
index 0000000..d1598dc
--- /dev/null
+++ b/test/CodeGen/X86/zlib-longest-match.ll
@@ -0,0 +1,240 @@
+; RUN: llc -march=x86-64 < %s -block-placement-exit-block-bias=20 | FileCheck %s
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; This is longest_match, the hot function from zlib's deflate implementation.
+
+%struct.internal_state = type { %struct.z_stream_s*, i32, i8*, i64, i8*, i32, i32, %struct.gz_header_s*, i32, i8, i32, i32, i32, i32, i8*, i64, i16*, i16*, i32, i32, i32, i32, i32, i64, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, [573 x %struct.ct_data_s], [61 x %struct.ct_data_s], [39 x %struct.ct_data_s], %struct.tree_desc_s, %struct.tree_desc_s, %struct.tree_desc_s, [16 x i16], [573 x i32], i32, i32, [573 x i8], i8*, i32, i32, i16*, i64, i64, i32, i32, i16, i32, i64 }
+%struct.z_stream_s = type { i8*, i32, i64, i8*, i32, i64, i8*, %struct.internal_state*, i8* (i8*, i32, i32)*, void (i8*, i8*)*, i8*, i32, i64, i64 }
+%struct.gz_header_s = type { i32, i64, i32, i32, i8*, i32, i32, i8*, i32, i8*, i32, i32, i32 }
+%struct.ct_data_s = type { %union.anon, %union.anon.0 }
+%union.anon = type { i16 }
+%union.anon.0 = type { i16 }
+%struct.tree_desc_s = type { %struct.ct_data_s*, i32, %struct.static_tree_desc_s* }
+%struct.static_tree_desc_s = type { i32 }
+
+; CHECK-LABEL: longest_match:
+
+; Verify that there are no spills or reloads in the loop exit block. This loop
+; is mostly cold; only %do.cond125 and %land.rhs131 are hot.
+; CHECK: %do.cond125
+; CHECK-NOT: {{Spill|Reload}}
+; CHECK: jbe
+
+; Verify that block placement doesn't destroy source order. It's important that
+; the two hot blocks are laid out close to each other.
+; CHECK-NEXT: %land.rhs131
+; CHECK: jne
+; CHECK: jmp
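+;
+; For orientation: %do.cond125 advances to the next candidate match by
+; following the hash chain (prev[cur_match & w_mask]) until it falls below
+; the window limit, and %land.rhs131 decrements chain_length and branches
+; back to %do.body; that back edge is the hot path being protected here.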
+define i32 @longest_match(%struct.internal_state* nocapture %s, i32 %cur_match) nounwind {
+entry:
+ %max_chain_length = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 31
+ %0 = load i32* %max_chain_length, align 4
+ %window = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 14
+ %1 = load i8** %window, align 8
+ %strstart = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 27
+ %2 = load i32* %strstart, align 4
+ %idx.ext = zext i32 %2 to i64
+ %add.ptr = getelementptr inbounds i8* %1, i64 %idx.ext
+ %prev_length = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 30
+ %3 = load i32* %prev_length, align 4
+ %nice_match1 = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 36
+ %4 = load i32* %nice_match1, align 4
+ %w_size = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 11
+ %5 = load i32* %w_size, align 4
+ %sub = add i32 %5, -262
+ %cmp = icmp ugt i32 %2, %sub
+ %sub6 = sub i32 %2, %sub
+ %sub6. = select i1 %cmp, i32 %sub6, i32 0
+ %prev7 = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 16
+ %6 = load i16** %prev7, align 8
+ %w_mask = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 13
+ %7 = load i32* %w_mask, align 4
+ %add.ptr11.sum = add i64 %idx.ext, 258
+ %add.ptr12 = getelementptr inbounds i8* %1, i64 %add.ptr11.sum
+ %sub13 = add nsw i32 %3, -1
+ %idxprom = sext i32 %sub13 to i64
+ %add.ptr.sum = add i64 %idxprom, %idx.ext
+ %arrayidx = getelementptr inbounds i8* %1, i64 %add.ptr.sum
+ %8 = load i8* %arrayidx, align 1
+ %idxprom14 = sext i32 %3 to i64
+ %add.ptr.sum213 = add i64 %idxprom14, %idx.ext
+ %arrayidx15 = getelementptr inbounds i8* %1, i64 %add.ptr.sum213
+ %9 = load i8* %arrayidx15, align 1
+ %good_match = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 35
+ %10 = load i32* %good_match, align 4
+ %cmp17 = icmp ult i32 %3, %10
+ %shr = lshr i32 %0, 2
+ %chain_length.0 = select i1 %cmp17, i32 %0, i32 %shr
+ %lookahead = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 29
+ %11 = load i32* %lookahead, align 4
+ %cmp18 = icmp ugt i32 %4, %11
+ %. = select i1 %cmp18, i32 %11, i32 %4
+ %match_start = getelementptr inbounds %struct.internal_state* %s, i64 0, i32 28
+ %add.ptr.sum217 = add i64 %idx.ext, 1
+ %arrayidx44 = getelementptr inbounds i8* %1, i64 %add.ptr.sum217
+ %add.ptr.sum218 = add i64 %idx.ext, 2
+ %add.ptr50 = getelementptr inbounds i8* %1, i64 %add.ptr.sum218
+ %sub.ptr.lhs.cast = ptrtoint i8* %add.ptr12 to i64
+ br label %do.body
+
+do.body: ; preds = %land.rhs131, %entry
+ %best_len.0 = phi i32 [ %best_len.1, %land.rhs131 ], [ %3, %entry ]
+ %chain_length.1 = phi i32 [ %dec, %land.rhs131 ], [ %chain_length.0, %entry ]
+ %cur_match.addr.0 = phi i32 [ %conv128, %land.rhs131 ], [ %cur_match, %entry ]
+ %scan_end1.0 = phi i8 [ %scan_end1.1, %land.rhs131 ], [ %8, %entry ]
+ %scan_end.0 = phi i8 [ %scan_end.1, %land.rhs131 ], [ %9, %entry ]
+ %idx.ext23 = zext i32 %cur_match.addr.0 to i64
+ %add.ptr24 = getelementptr inbounds i8* %1, i64 %idx.ext23
+ %idxprom25 = sext i32 %best_len.0 to i64
+ %add.ptr24.sum = add i64 %idx.ext23, %idxprom25
+ %arrayidx26 = getelementptr inbounds i8* %1, i64 %add.ptr24.sum
+ %12 = load i8* %arrayidx26, align 1
+ %cmp28 = icmp eq i8 %12, %scan_end.0
+ br i1 %cmp28, label %lor.lhs.false, label %do.cond125
+
+lor.lhs.false: ; preds = %do.body
+ %sub30 = add nsw i32 %best_len.0, -1
+ %idxprom31 = sext i32 %sub30 to i64
+ %add.ptr24.sum214 = add i64 %idx.ext23, %idxprom31
+ %arrayidx32 = getelementptr inbounds i8* %1, i64 %add.ptr24.sum214
+ %13 = load i8* %arrayidx32, align 1
+ %cmp35 = icmp eq i8 %13, %scan_end1.0
+ br i1 %cmp35, label %lor.lhs.false37, label %do.cond125
+
+lor.lhs.false37: ; preds = %lor.lhs.false
+ %14 = load i8* %add.ptr24, align 1
+ %15 = load i8* %add.ptr, align 1
+ %cmp40 = icmp eq i8 %14, %15
+ br i1 %cmp40, label %lor.lhs.false42, label %do.cond125
+
+lor.lhs.false42: ; preds = %lor.lhs.false37
+ %add.ptr24.sum215 = add i64 %idx.ext23, 1
+ %incdec.ptr = getelementptr inbounds i8* %1, i64 %add.ptr24.sum215
+ %16 = load i8* %incdec.ptr, align 1
+ %17 = load i8* %arrayidx44, align 1
+ %cmp46 = icmp eq i8 %16, %17
+ br i1 %cmp46, label %if.end49, label %do.cond125
+
+if.end49: ; preds = %lor.lhs.false42
+ %incdec.ptr.sum = add i64 %idx.ext23, 2
+ %incdec.ptr51 = getelementptr inbounds i8* %1, i64 %incdec.ptr.sum
+ br label %do.cond
+
+do.cond: ; preds = %land.lhs.true100, %if.end49
+ %match.0 = phi i8* [ %incdec.ptr51, %if.end49 ], [ %incdec.ptr103, %land.lhs.true100 ]
+ %scan.1 = phi i8* [ %add.ptr50, %if.end49 ], [ %incdec.ptr101, %land.lhs.true100 ]
+ %incdec.ptr53 = getelementptr inbounds i8* %scan.1, i64 1
+ %18 = load i8* %incdec.ptr53, align 1
+ %incdec.ptr55 = getelementptr inbounds i8* %match.0, i64 1
+ %19 = load i8* %incdec.ptr55, align 1
+ %cmp57 = icmp eq i8 %18, %19
+ br i1 %cmp57, label %land.lhs.true, label %do.end
+
+land.lhs.true: ; preds = %do.cond
+ %incdec.ptr59 = getelementptr inbounds i8* %scan.1, i64 2
+ %20 = load i8* %incdec.ptr59, align 1
+ %incdec.ptr61 = getelementptr inbounds i8* %match.0, i64 2
+ %21 = load i8* %incdec.ptr61, align 1
+ %cmp63 = icmp eq i8 %20, %21
+ br i1 %cmp63, label %land.lhs.true65, label %do.end
+
+land.lhs.true65: ; preds = %land.lhs.true
+ %incdec.ptr66 = getelementptr inbounds i8* %scan.1, i64 3
+ %22 = load i8* %incdec.ptr66, align 1
+ %incdec.ptr68 = getelementptr inbounds i8* %match.0, i64 3
+ %23 = load i8* %incdec.ptr68, align 1
+ %cmp70 = icmp eq i8 %22, %23
+ br i1 %cmp70, label %land.lhs.true72, label %do.end
+
+land.lhs.true72: ; preds = %land.lhs.true65
+ %incdec.ptr73 = getelementptr inbounds i8* %scan.1, i64 4
+ %24 = load i8* %incdec.ptr73, align 1
+ %incdec.ptr75 = getelementptr inbounds i8* %match.0, i64 4
+ %25 = load i8* %incdec.ptr75, align 1
+ %cmp77 = icmp eq i8 %24, %25
+ br i1 %cmp77, label %land.lhs.true79, label %do.end
+
+land.lhs.true79: ; preds = %land.lhs.true72
+ %incdec.ptr80 = getelementptr inbounds i8* %scan.1, i64 5
+ %26 = load i8* %incdec.ptr80, align 1
+ %incdec.ptr82 = getelementptr inbounds i8* %match.0, i64 5
+ %27 = load i8* %incdec.ptr82, align 1
+ %cmp84 = icmp eq i8 %26, %27
+ br i1 %cmp84, label %land.lhs.true86, label %do.end
+
+land.lhs.true86: ; preds = %land.lhs.true79
+ %incdec.ptr87 = getelementptr inbounds i8* %scan.1, i64 6
+ %28 = load i8* %incdec.ptr87, align 1
+ %incdec.ptr89 = getelementptr inbounds i8* %match.0, i64 6
+ %29 = load i8* %incdec.ptr89, align 1
+ %cmp91 = icmp eq i8 %28, %29
+ br i1 %cmp91, label %land.lhs.true93, label %do.end
+
+land.lhs.true93: ; preds = %land.lhs.true86
+ %incdec.ptr94 = getelementptr inbounds i8* %scan.1, i64 7
+ %30 = load i8* %incdec.ptr94, align 1
+ %incdec.ptr96 = getelementptr inbounds i8* %match.0, i64 7
+ %31 = load i8* %incdec.ptr96, align 1
+ %cmp98 = icmp eq i8 %30, %31
+ br i1 %cmp98, label %land.lhs.true100, label %do.end
+
+land.lhs.true100: ; preds = %land.lhs.true93
+ %incdec.ptr101 = getelementptr inbounds i8* %scan.1, i64 8
+ %32 = load i8* %incdec.ptr101, align 1
+ %incdec.ptr103 = getelementptr inbounds i8* %match.0, i64 8
+ %33 = load i8* %incdec.ptr103, align 1
+ %cmp105 = icmp eq i8 %32, %33
+ %cmp107 = icmp ult i8* %incdec.ptr101, %add.ptr12
+ %or.cond = and i1 %cmp105, %cmp107
+ br i1 %or.cond, label %do.cond, label %do.end
+
+do.end: ; preds = %land.lhs.true100, %land.lhs.true93, %land.lhs.true86, %land.lhs.true79, %land.lhs.true72, %land.lhs.true65, %land.lhs.true, %do.cond
+ %scan.2 = phi i8* [ %incdec.ptr101, %land.lhs.true100 ], [ %incdec.ptr94, %land.lhs.true93 ], [ %incdec.ptr87, %land.lhs.true86 ], [ %incdec.ptr80, %land.lhs.true79 ], [ %incdec.ptr73, %land.lhs.true72 ], [ %incdec.ptr66, %land.lhs.true65 ], [ %incdec.ptr59, %land.lhs.true ], [ %incdec.ptr53, %do.cond ]
+ %sub.ptr.rhs.cast = ptrtoint i8* %scan.2 to i64
+ %sub.ptr.sub = sub i64 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast
+ %conv109 = trunc i64 %sub.ptr.sub to i32
+ %sub110 = sub nsw i32 258, %conv109
+ %cmp112 = icmp sgt i32 %sub110, %best_len.0
+ br i1 %cmp112, label %if.then114, label %do.cond125
+
+if.then114: ; preds = %do.end
+ store i32 %cur_match.addr.0, i32* %match_start, align 4
+ %cmp115 = icmp slt i32 %sub110, %.
+ br i1 %cmp115, label %if.end118, label %do.end135
+
+if.end118: ; preds = %if.then114
+ %sub119 = add nsw i32 %sub110, -1
+ %idxprom120 = sext i32 %sub119 to i64
+ %add.ptr111.sum = add i64 %idxprom120, %idx.ext
+ %arrayidx121 = getelementptr inbounds i8* %1, i64 %add.ptr111.sum
+ %34 = load i8* %arrayidx121, align 1
+ %idxprom122 = sext i32 %sub110 to i64
+ %add.ptr111.sum216 = add i64 %idxprom122, %idx.ext
+ %arrayidx123 = getelementptr inbounds i8* %1, i64 %add.ptr111.sum216
+ %35 = load i8* %arrayidx123, align 1
+ br label %do.cond125
+
+do.cond125: ; preds = %if.end118, %do.end, %lor.lhs.false42, %lor.lhs.false37, %lor.lhs.false, %do.body
+ %best_len.1 = phi i32 [ %best_len.0, %do.body ], [ %best_len.0, %lor.lhs.false ], [ %best_len.0, %lor.lhs.false37 ], [ %best_len.0, %lor.lhs.false42 ], [ %sub110, %if.end118 ], [ %best_len.0, %do.end ]
+ %scan_end1.1 = phi i8 [ %scan_end1.0, %do.body ], [ %scan_end1.0, %lor.lhs.false ], [ %scan_end1.0, %lor.lhs.false37 ], [ %scan_end1.0, %lor.lhs.false42 ], [ %34, %if.end118 ], [ %scan_end1.0, %do.end ]
+ %scan_end.1 = phi i8 [ %scan_end.0, %do.body ], [ %scan_end.0, %lor.lhs.false ], [ %scan_end.0, %lor.lhs.false37 ], [ %scan_end.0, %lor.lhs.false42 ], [ %35, %if.end118 ], [ %scan_end.0, %do.end ]
+ %and = and i32 %cur_match.addr.0, %7
+ %idxprom126 = zext i32 %and to i64
+ %arrayidx127 = getelementptr inbounds i16* %6, i64 %idxprom126
+ %36 = load i16* %arrayidx127, align 2
+ %conv128 = zext i16 %36 to i32
+ %cmp129 = icmp ugt i32 %conv128, %sub6.
+ br i1 %cmp129, label %land.rhs131, label %do.end135
+
+land.rhs131: ; preds = %do.cond125
+ %dec = add i32 %chain_length.1, -1
+ %cmp132 = icmp eq i32 %dec, 0
+ br i1 %cmp132, label %do.end135, label %do.body
+
+do.end135: ; preds = %land.rhs131, %do.cond125, %if.then114
+ %best_len.2 = phi i32 [ %best_len.1, %land.rhs131 ], [ %best_len.1, %do.cond125 ], [ %sub110, %if.then114 ]
+ %cmp137 = icmp ugt i32 %best_len.2, %11
+ %.best_len.2 = select i1 %cmp137, i32 %11, i32 %best_len.2
+ ret i32 %.best_len.2
+}