aboutsummaryrefslogtreecommitdiffstats
path: root/test/CodeGen/X86
diff options
context:
space:
mode:
authorStephen Hines <srhines@google.com>2014-05-29 02:49:00 -0700
committerStephen Hines <srhines@google.com>2014-05-29 02:49:00 -0700
commitdce4a407a24b04eebc6a376f8e62b41aaa7b071f (patch)
treedcebc53f2b182f145a2e659393bf9a0472cedf23 /test/CodeGen/X86
parent220b921aed042f9e520c26cffd8282a94c66c3d5 (diff)
downloadexternal_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f.zip
external_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f.tar.gz
external_llvm-dce4a407a24b04eebc6a376f8e62b41aaa7b071f.tar.bz2
Update LLVM for 3.5 rebase (r209712).
Change-Id: I149556c940fb7dc92d075273c87ff584f400941f
Diffstat (limited to 'test/CodeGen/X86')
-rw-r--r--test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll10
-rw-r--r--test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll6
-rw-r--r--test/CodeGen/X86/2010-08-04-StackVariable.ll2
-rw-r--r--test/CodeGen/X86/MergeConsecutiveStores.ll1
-rw-r--r--test/CodeGen/X86/aliases.ll2
-rw-r--r--test/CodeGen/X86/atom-bypass-slow-division-64.ll4
-rw-r--r--test/CodeGen/X86/avoid_complex_am.ll40
-rw-r--r--test/CodeGen/X86/avx-blend.ll59
-rw-r--r--test/CodeGen/X86/avx-shuffle.ll26
-rw-r--r--test/CodeGen/X86/avx.ll136
-rw-r--r--test/CodeGen/X86/avx1-logical-load-folding.ll60
-rw-r--r--test/CodeGen/X86/avx2-blend.ll11
-rw-r--r--test/CodeGen/X86/avx2-vector-shifts.ll10
-rw-r--r--test/CodeGen/X86/avx512-cvt.ll32
-rw-r--r--test/CodeGen/X86/avx512-gather-scatter-intrin.ll206
-rw-r--r--test/CodeGen/X86/avx512-insert-extract.ll38
-rw-r--r--test/CodeGen/X86/avx512-intrinsics.ll14
-rw-r--r--test/CodeGen/X86/avx512-mov.ll28
-rw-r--r--test/CodeGen/X86/avx512-shuffle.ll19
-rw-r--r--test/CodeGen/X86/blend-msb.ll22
-rw-r--r--test/CodeGen/X86/bmi.ll17
-rw-r--r--test/CodeGen/X86/br-fold.ll18
-rw-r--r--test/CodeGen/X86/bswap-vector.ll137
-rw-r--r--test/CodeGen/X86/cdecl-method-return.ll69
-rw-r--r--test/CodeGen/X86/cfi.ll27
-rw-r--r--test/CodeGen/X86/cmp.ll45
-rw-r--r--test/CodeGen/X86/codegen-prepare-addrmode-sext.ll20
-rw-r--r--test/CodeGen/X86/codegen-prepare-crash.ll14
-rw-r--r--test/CodeGen/X86/codegen-prepare.ll1
-rw-r--r--test/CodeGen/X86/combine-avx-intrinsics.ll119
-rw-r--r--test/CodeGen/X86/combine-avx2-intrinsics.ll164
-rw-r--r--test/CodeGen/X86/combine-sse2-intrinsics.ll53
-rw-r--r--test/CodeGen/X86/combine-sse41-intrinsics.ll182
-rw-r--r--test/CodeGen/X86/constant-hoisting-shift-immediate.ll25
-rw-r--r--test/CodeGen/X86/divide-by-constant.ll8
-rw-r--r--test/CodeGen/X86/dllexport-x86_64.ll68
-rw-r--r--test/CodeGen/X86/dllexport.ll85
-rw-r--r--test/CodeGen/X86/expand-opaque-const.ll21
-rw-r--r--test/CodeGen/X86/f16c-intrinsics.ll14
-rw-r--r--test/CodeGen/X86/fma-do-not-commute.ll30
-rw-r--r--test/CodeGen/X86/fold-load-vec.ll2
-rw-r--r--test/CodeGen/X86/gcc_except_table.ll4
-rw-r--r--test/CodeGen/X86/global-sections.ll80
-rw-r--r--test/CodeGen/X86/indirect-hidden.ll43
-rw-r--r--test/CodeGen/X86/isel-sink.ll1
-rw-r--r--test/CodeGen/X86/lit.local.cfg2
-rw-r--r--test/CodeGen/X86/live-out-reg-info.ll2
-rw-r--r--test/CodeGen/X86/lower-bitcast.ll155
-rw-r--r--test/CodeGen/X86/lower-vec-shift.ll125
-rw-r--r--test/CodeGen/X86/lzcnt-tzcnt.ll447
-rw-r--r--test/CodeGen/X86/masked-iv-safe.ll6
-rw-r--r--test/CodeGen/X86/merge_store.ll1
-rw-r--r--test/CodeGen/X86/mod128.ll26
-rw-r--r--test/CodeGen/X86/musttail-indirect.ll124
-rw-r--r--test/CodeGen/X86/musttail-thiscall.ll31
-rw-r--r--test/CodeGen/X86/musttail.ll90
-rw-r--r--test/CodeGen/X86/named-reg-alloc.ll14
-rw-r--r--test/CodeGen/X86/named-reg-notareg.ll13
-rw-r--r--test/CodeGen/X86/no-cfi.ll34
-rw-r--r--test/CodeGen/X86/peep-test-4.ll76
-rw-r--r--test/CodeGen/X86/peephole-multiple-folds.ll4
-rw-r--r--test/CodeGen/X86/ragreedy-last-chance-recoloring.ll13
-rw-r--r--test/CodeGen/X86/rdtsc.ll53
-rw-r--r--test/CodeGen/X86/remat-invalid-liveness.ll85
-rw-r--r--test/CodeGen/X86/ret-mmx.ll1
-rw-r--r--test/CodeGen/X86/rotate3.ll76
-rw-r--r--test/CodeGen/X86/segmented-stacks-dynamic.ll12
-rw-r--r--test/CodeGen/X86/segmented-stacks.ll89
-rw-r--r--test/CodeGen/X86/sse2.ll18
-rw-r--r--test/CodeGen/X86/sse3.ll2
-rw-r--r--test/CodeGen/X86/sse41-blend.ll40
-rw-r--r--test/CodeGen/X86/sse41.ll447
-rw-r--r--test/CodeGen/X86/stack-protector-dbginfo.ll2
-rw-r--r--test/CodeGen/X86/stack-protector.ll597
-rw-r--r--test/CodeGen/X86/stackpointer.ll28
-rw-r--r--test/CodeGen/X86/tls.ll87
-rw-r--r--test/CodeGen/X86/vec_shuffle-41.ll21
-rw-r--r--test/CodeGen/X86/vec_splat.ll16
-rw-r--r--test/CodeGen/X86/vector-idiv.ll217
-rw-r--r--test/CodeGen/X86/win32_sret.ll83
-rw-r--r--test/CodeGen/X86/x86-64-sret-return-2.ll18
81 files changed, 4353 insertions, 675 deletions
diff --git a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
index e1f8901..4d7c3a1 100644
--- a/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
+++ b/test/CodeGen/X86/2007-03-15-GEP-Idx-Sink.ll
@@ -1,8 +1,14 @@
-; RUN: llc < %s -march=x86 -mtriple=i686-darwin | \
-; RUN: grep push | count 3
+; RUN: llc < %s -march=x86 -mtriple=i686-darwin | FileCheck %s
+; RUN: llc < %s -march=x86 -mtriple=i686-darwin -addr-sink-using-gep=1 | FileCheck %s
define void @foo(i8** %buf, i32 %size, i32 %col, i8* %p) nounwind {
entry:
+; CHECK-LABEL: @foo
+; CHECK: push
+; CHECK: push
+; CHECK: push
+; CHECK-NOT: push
+
icmp sgt i32 %size, 0 ; <i1>:0 [#uses=1]
br i1 %0, label %bb.preheader, label %return
diff --git a/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll b/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
index e673d31..e64375a 100644
--- a/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
+++ b/test/CodeGen/X86/2008-03-12-ThreadLocalAlias.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -relocation-model=pic | grep TLSGD | count 2
+; RUN: llc < %s -relocation-model=pic | FileCheck %s
; PR2137
; ModuleID = '1.c'
@@ -11,6 +11,8 @@ target triple = "i386-pc-linux-gnu"
@__libc_resp = hidden alias %struct.__res_state** @__resp ; <%struct.__res_state**> [#uses=2]
define i32 @foo() {
+; CHECK-LABEL: foo:
+; CHECK: leal __libc_resp@TLSLD
entry:
%retval = alloca i32 ; <i32*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
@@ -24,6 +26,8 @@ return: ; preds = %entry
}
define i32 @bar() {
+; CHECK-LABEL: bar:
+; CHECK: leal __libc_resp@TLSLD
entry:
%retval = alloca i32 ; <i32*> [#uses=1]
%"alloca point" = bitcast i32 0 to i32 ; <i32> [#uses=0]
diff --git a/test/CodeGen/X86/2010-08-04-StackVariable.ll b/test/CodeGen/X86/2010-08-04-StackVariable.ll
index 91fec3b..09e34ef 100644
--- a/test/CodeGen/X86/2010-08-04-StackVariable.ll
+++ b/test/CodeGen/X86/2010-08-04-StackVariable.ll
@@ -76,7 +76,7 @@ declare void @llvm.dbg.value(metadata, i64, metadata) nounwind readnone
!llvm.dbg.cu = !{!3}
!llvm.module.flags = !{!49}
-!46 = metadata !{metadata !0, metadata !9, metadata !16, metadata !17, metadata !20}
+!46 = metadata !{metadata !16, metadata !17, metadata !20}
!0 = metadata !{i32 786478, metadata !47, metadata !1, metadata !"SVal", metadata !"SVal", metadata !"", i32 11, metadata !14, i1 false, i1 false, i32 0, i32 0, null, i1 false, i1 false, null, null, null, null, i32 11} ; [ DW_TAG_subprogram ]
!1 = metadata !{i32 786451, metadata !47, metadata !2, metadata !"SVal", i32 1, i64 128, i64 64, i64 0, i32 0, null, metadata !4, i32 0, null, null, null} ; [ DW_TAG_structure_type ] [SVal] [line 1, size 128, align 64, offset 0] [def] [from ]
diff --git a/test/CodeGen/X86/MergeConsecutiveStores.ll b/test/CodeGen/X86/MergeConsecutiveStores.ll
index 0ef3aa5..f6d6852 100644
--- a/test/CodeGen/X86/MergeConsecutiveStores.ll
+++ b/test/CodeGen/X86/MergeConsecutiveStores.ll
@@ -1,4 +1,5 @@
; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx < %s | FileCheck %s
+; RUN: llc -march=x86-64 -mcpu=corei7 -mattr=+avx -addr-sink-using-gep=1 < %s | FileCheck %s
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-apple-macosx10.8.0"
diff --git a/test/CodeGen/X86/aliases.ll b/test/CodeGen/X86/aliases.ll
index d0a262d..8487c60 100644
--- a/test/CodeGen/X86/aliases.ll
+++ b/test/CodeGen/X86/aliases.ll
@@ -22,7 +22,7 @@ define i32 @foo_f() {
@bar_i = alias internal i32* @bar
; CHECK-DAG: .globl A
-@A = alias bitcast (i32* @bar to i64*)
+@A = alias i64, i32* @bar
; CHECK-DAG: .globl bar_h
; CHECK-DAG: .hidden bar_h
diff --git a/test/CodeGen/X86/atom-bypass-slow-division-64.ll b/test/CodeGen/X86/atom-bypass-slow-division-64.ll
index d1b52a4..5980b79 100644
--- a/test/CodeGen/X86/atom-bypass-slow-division-64.ll
+++ b/test/CodeGen/X86/atom-bypass-slow-division-64.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mcpu=atom -mtriple=i686-linux -march=x86-64 | FileCheck %s
+; RUN: llc < %s -mcpu=atom -march=x86-64 | FileCheck %s
+
+target triple = "x86_64-unknown-linux-gnu"
; Additional tests for 64-bit divide bypass
diff --git a/test/CodeGen/X86/avoid_complex_am.ll b/test/CodeGen/X86/avoid_complex_am.ll
new file mode 100644
index 0000000..7f09519
--- /dev/null
+++ b/test/CodeGen/X86/avoid_complex_am.ll
@@ -0,0 +1,40 @@
+; RUN: opt -S -loop-reduce < %s | FileCheck %s
+; Complex addressing mode are costly.
+; Make loop-reduce prefer unscaled accesses.
+; On X86, reg1 + 1*reg2 has the same cost as reg1 + 8*reg2.
+; Therefore, LSR currently prefers to fold as much computation as possible
+; in the addressing mode.
+; <rdar://problem/16730541>
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+define void @mulDouble(double* nocapture %a, double* nocapture %b, double* nocapture %c) {
+; CHECK: @mulDouble
+entry:
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+; CHECK: [[IV:%[^ ]+]] = phi i64 [ [[IVNEXT:%[^,]+]], %for.body ], [ 0, %entry ]
+; Only one induction variable should have been generated.
+; CHECK-NOT: phi
+ %indvars.iv = phi i64 [ 1, %entry ], [ %indvars.iv.next, %for.body ]
+ %tmp = add nsw i64 %indvars.iv, -1
+ %arrayidx = getelementptr inbounds double* %b, i64 %tmp
+ %tmp1 = load double* %arrayidx, align 8
+; The induction variable should carry the scaling factor: 1.
+; CHECK: [[IVNEXT]] = add nuw nsw i64 [[IV]], 1
+ %indvars.iv.next = add i64 %indvars.iv, 1
+ %arrayidx2 = getelementptr inbounds double* %c, i64 %indvars.iv.next
+ %tmp2 = load double* %arrayidx2, align 8
+ %mul = fmul double %tmp1, %tmp2
+ %arrayidx4 = getelementptr inbounds double* %a, i64 %indvars.iv
+ store double %mul, double* %arrayidx4, align 8
+ %lftr.wideiv = trunc i64 %indvars.iv.next to i32
+; Comparison should be 19 * 1 = 19.
+; CHECK: icmp eq i32 {{%[^,]+}}, 19
+ %exitcond = icmp eq i32 %lftr.wideiv, 20
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
diff --git a/test/CodeGen/X86/avx-blend.ll b/test/CodeGen/X86/avx-blend.ll
index 5fcd5ff..e21c7a0 100644
--- a/test/CodeGen/X86/avx-blend.ll
+++ b/test/CodeGen/X86/avx-blend.ll
@@ -3,7 +3,16 @@
; AVX128 tests:
;CHECK-LABEL: vsel_float:
-;CHECK: vblendvps
+; select mask is <i1 true, i1 false, i1 true, i1 false>.
+; Big endian representation is 0101 = 5.
+; '1' means takes the first argument, '0' means takes the second argument.
+; This is the opposite of the intel syntax, thus we expect
+; the inverted mask: 1010 = 10.
+; According to the ABI:
+; v1 is in xmm0 => first argument is xmm0.
+; v2 is in xmm1 => second argument is xmm1.
+; result is in xmm0 => destination argument.
+;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x float> %v1, <4 x float> %v2
@@ -12,7 +21,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK-LABEL: vsel_i32:
-;CHECK: vblendvps
+;CHECK: vblendps $10, %xmm1, %xmm0, %xmm0
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x i32> %v1, <4 x i32> %v2
@@ -52,7 +61,13 @@ define <16 x i8> @vsel_i8(<16 x i8> %v1, <16 x i8> %v2) {
;CHECK-LABEL: vsel_float8:
;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
+; which translates into the boolean mask (big endian representation):
+; 00010001 = 17.
+; '1' means takes the first argument, '0' means takes the second argument.
+; This is the opposite of the intel syntax, thus we expect
+; the inverted mask: 11101110 = 238.
+;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
;CHECK: ret
define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x float> %v1, <8 x float> %v2
@@ -61,7 +76,7 @@ define <8 x float> @vsel_float8(<8 x float> %v1, <8 x float> %v2) {
;CHECK-LABEL: vsel_i328:
;CHECK-NOT: vinsertf128
-;CHECK: vblendvps
+;CHECK: vblendps $238, %ymm1, %ymm0, %ymm0
;CHECK-NEXT: ret
define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i32> %v1, <8 x i32> %v2
@@ -69,7 +84,15 @@ define <8 x i32> @vsel_i328(<8 x i32> %v1, <8 x i32> %v2) {
}
;CHECK-LABEL: vsel_double8:
-;CHECK: vblendvpd
+; select mask is 2x: 0001 => intel mask: ~0001 = 14
+; ABI:
+; v1 is in ymm0 and ymm1.
+; v2 is in ymm2 and ymm3.
+; result is in ymm0 and ymm1.
+; Compute the low part: res.low = blend v1.low, v2.low, blendmask
+;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0
+; Compute the high part.
+;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1
;CHECK: ret
define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x double> %v1, <8 x double> %v2
@@ -77,7 +100,8 @@ define <8 x double> @vsel_double8(<8 x double> %v1, <8 x double> %v2) {
}
;CHECK-LABEL: vsel_i648:
-;CHECK: vblendvpd
+;CHECK: vblendpd $14, %ymm2, %ymm0, %ymm0
+;CHECK: vblendpd $14, %ymm3, %ymm1, %ymm1
;CHECK: ret
define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i64> %v1, <8 x i64> %v2
@@ -86,7 +110,7 @@ define <8 x i64> @vsel_i648(<8 x i64> %v1, <8 x i64> %v2) {
;CHECK-LABEL: vsel_double4:
;CHECK-NOT: vinsertf128
-;CHECK: vblendvpd
+;CHECK: vshufpd $10
;CHECK-NEXT: ret
define <4 x double> @vsel_double4(<4 x double> %v1, <4 x double> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 false>, <4 x double> %v1, <4 x double> %v2
@@ -112,4 +136,25 @@ define <2 x double> @testb(<2 x double> %x, <2 x double> %y) {
ret <2 x double> %min
}
+; If we can figure out a blend has a constant mask, we should emit the
+; blend instruction with an immediate mask
+define <4 x double> @constant_blendvpd_avx(<4 x double> %xy, <4 x double> %ab) {
+; CHECK-LABEL: constant_blendvpd_avx:
+; CHECK-NOT: mov
+; CHECK: vblendpd
+; CHECK: ret
+ %1 = select <4 x i1> <i1 false, i1 false, i1 true, i1 false>, <4 x double> %xy, <4 x double> %ab
+ ret <4 x double> %1
+}
+
+define <8 x float> @constant_blendvps_avx(<8 x float> %xyzw, <8 x float> %abcd) {
+; CHECK-LABEL: constant_blendvps_avx:
+; CHECK-NOT: mov
+; CHECK: vblendps
+; CHECK: ret
+ %1 = select <8 x i1> <i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false, i1 true>, <8 x float> %xyzw, <8 x float> %abcd
+ ret <8 x float> %1
+}
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 02aa617..f407ba4 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -306,3 +306,29 @@ define void @test20() {
store <3 x double> %a1, <3 x double>* undef, align 1
ret void
}
+
+define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
+; CHECK-LABEL: test_insert_64_zext
+; CHECK-NOT: xor
+; CHECK: vmovq
+ %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %1
+}
+
+;; Ensure we don't use insertps from non v4x32 vectors.
+;; On SSE4.1 it works because bigger vectors use more than 1 register.
+;; On AVX they get passed in a single register.
+;; FIXME: We could probably optimize this case, if we're only using the
+;; first 4 indices.
+define <4 x i32> @insert_from_diff_size(<8 x i32> %x) {
+; CHECK-LABEL: insert_from_diff_size:
+; CHECK-NOT: insertps
+; CHECK: ret
+ %vecext = extractelement <8 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %a.0 = extractelement <8 x i32> %x, i32 0
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %a.0, i32 3
+ ret <4 x i32> %vecinit3
+}
diff --git a/test/CodeGen/X86/avx.ll b/test/CodeGen/X86/avx.ll
new file mode 100644
index 0000000..6069c14
--- /dev/null
+++ b/test/CodeGen/X86/avx.ll
@@ -0,0 +1,136 @@
+; RUN: llc < %s -mtriple=i686-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
+
+define <4 x i32> @blendvb_fallback_v4i32(<4 x i1> %mask, <4 x i32> %x, <4 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v4i32
+; CHECK: vblendvps
+; CHECK: ret
+ %ret = select <4 x i1> %mask, <4 x i32> %x, <4 x i32> %y
+ ret <4 x i32> %ret
+}
+
+define <8 x i32> @blendvb_fallback_v8i32(<8 x i1> %mask, <8 x i32> %x, <8 x i32> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8i32
+; CHECK: vblendvps
+; CHECK: ret
+ %ret = select <8 x i1> %mask, <8 x i32> %x, <8 x i32> %y
+ ret <8 x i32> %ret
+}
+
+define <8 x float> @blendvb_fallback_v8f32(<8 x i1> %mask, <8 x float> %x, <8 x float> %y) {
+; CHECK-LABEL: @blendvb_fallback_v8f32
+; CHECK: vblendvps
+; CHECK: ret
+ %ret = select <8 x i1> %mask, <8 x float> %x, <8 x float> %y
+ ret <8 x float> %ret
+}
+
+declare <4 x float> @llvm.x86.sse41.insertps(<4 x float>, <4 x float>, i32) nounwind readnone
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+ ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+ ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: vinsertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+ %2 = load <4 x float>* %1, align 16
+ %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+ ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %b, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: vbroadcastss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK: vaddps
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+ %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+ %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+ %11 = fadd <4 x float> %7, %8
+ %12 = fadd <4 x float> %9, %10
+ %13 = fadd <4 x float> %11, %12
+ ret <4 x float> %13
+}
diff --git a/test/CodeGen/X86/avx1-logical-load-folding.ll b/test/CodeGen/X86/avx1-logical-load-folding.ll
new file mode 100644
index 0000000..32301b1
--- /dev/null
+++ b/test/CodeGen/X86/avx1-logical-load-folding.ll
@@ -0,0 +1,60 @@
+; RUN: llc -O3 -disable-peephole -mcpu=corei7-avx -mattr=+avx < %s | FileCheck %s
+
+target datalayout = "e-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.9.0"
+
+; Function Attrs: nounwind ssp uwtable
+define void @test1(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = and <8 x i32> %tmp3, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp5 = bitcast <8 x i32> %tmp4 to <8 x float>
+ %tmp6 = extractelement <8 x float> %tmp5, i32 0
+ store float %tmp6, float* %C
+ ret void
+
+ ; CHECK: vandps LCPI0_0(%rip), %ymm0, %ymm0
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @test2(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = or <8 x i32> %tmp3, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp5 = bitcast <8 x i32> %tmp4 to <8 x float>
+ %tmp6 = extractelement <8 x float> %tmp5, i32 0
+ store float %tmp6, float* %C
+ ret void
+
+ ; CHECK: vorps LCPI1_0(%rip), %ymm0, %ymm0
+}
+
+; Function Attrs: nounwind ssp uwtable
+define void @test3(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = xor <8 x i32> %tmp3, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp5 = bitcast <8 x i32> %tmp4 to <8 x float>
+ %tmp6 = extractelement <8 x float> %tmp5, i32 0
+ store float %tmp6, float* %C
+ ret void
+
+ ; CHECK: vxorps LCPI2_0(%rip), %ymm0, %ymm0
+}
+
+define void @test4(float* %A, float* %C) #0 {
+ %tmp1 = bitcast float* %A to <8 x float>*
+ %tmp2 = load <8 x float>* %tmp1, align 32
+ %tmp3 = bitcast <8 x float> %tmp2 to <8 x i32>
+ %tmp4 = xor <8 x i32> %tmp3, <i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1, i32 -1>
+ %tmp5 = and <8 x i32> %tmp4, <i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647, i32 2147483647>
+ %tmp6 = bitcast <8 x i32> %tmp5 to <8 x float>
+ %tmp7 = extractelement <8 x float> %tmp6, i32 0
+ store float %tmp7, float * %C
+ ret void
+
+ ;CHECK: vandnps LCPI3_0(%rip), %ymm0, %ymm0
+}
diff --git a/test/CodeGen/X86/avx2-blend.ll b/test/CodeGen/X86/avx2-blend.ll
new file mode 100644
index 0000000..b02442b
--- /dev/null
+++ b/test/CodeGen/X86/avx2-blend.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 | FileCheck %s
+
+define <32 x i8> @constant_pblendvb_avx2(<32 x i8> %xyzw, <32 x i8> %abcd) {
+; CHECK-LABEL: constant_pblendvb_avx2:
+; CHECK: vmovdqa
+; CHECK: vpblendvb
+ %1 = select <32 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <32 x i8> %xyzw, <32 x i8> %abcd
+ ret <32 x i8> %1
+}
+
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
diff --git a/test/CodeGen/X86/avx2-vector-shifts.ll b/test/CodeGen/X86/avx2-vector-shifts.ll
index 4ae2905..e355301 100644
--- a/test/CodeGen/X86/avx2-vector-shifts.ll
+++ b/test/CodeGen/X86/avx2-vector-shifts.ll
@@ -52,6 +52,16 @@ entry:
; CHECK: vpaddd %ymm0, %ymm0, %ymm0
; CHECK: ret
+define <8 x i32> @test_vpslld_var(i32 %shift) {
+ %amt = insertelement <8 x i32> undef, i32 %shift, i32 0
+ %tmp = shl <8 x i32> <i32 192, i32 193, i32 194, i32 195, i32 196, i32 197, i32 198, i32 199>, %amt
+ ret <8 x i32> %tmp
+}
+
+; CHECK-LABEL: test_vpslld_var:
+; CHECK: vpslld %xmm0, %ymm1, %ymm0
+; CHECK: ret
+
define <8 x i32> @test_slld_3(<8 x i32> %InVec) {
entry:
%shl = shl <8 x i32> %InVec, <i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31, i32 31>
diff --git a/test/CodeGen/X86/avx512-cvt.ll b/test/CodeGen/X86/avx512-cvt.ll
index 1d83485..2476ea1 100644
--- a/test/CodeGen/X86/avx512-cvt.ll
+++ b/test/CodeGen/X86/avx512-cvt.ll
@@ -24,6 +24,22 @@ define <16 x i32> @fptoui00(<16 x float> %a) nounwind {
ret <16 x i32> %b
}
+; CHECK-LABEL: fptoui_256
+; CHECK: vcvttps2udq
+; CHECK: ret
+define <8 x i32> @fptoui_256(<8 x float> %a) nounwind {
+ %b = fptoui <8 x float> %a to <8 x i32>
+ ret <8 x i32> %b
+}
+
+; CHECK-LABEL: fptoui_128
+; CHECK: vcvttps2udq
+; CHECK: ret
+define <4 x i32> @fptoui_128(<4 x float> %a) nounwind {
+ %b = fptoui <4 x float> %a to <4 x i32>
+ ret <4 x i32> %b
+}
+
; CHECK-LABEL: fptoui01
; CHECK: vcvttpd2udq
; CHECK: ret
@@ -184,6 +200,22 @@ define <16 x float> @uitof32(<16 x i32> %a) nounwind {
ret <16 x float> %b
}
+; CHECK-LABEL: uitof32_256
+; CHECK: vcvtudq2ps
+; CHECK: ret
+define <8 x float> @uitof32_256(<8 x i32> %a) nounwind {
+ %b = uitofp <8 x i32> %a to <8 x float>
+ ret <8 x float> %b
+}
+
+; CHECK-LABEL: uitof32_128
+; CHECK: vcvtudq2ps
+; CHECK: ret
+define <4 x float> @uitof32_128(<4 x i32> %a) nounwind {
+ %b = uitofp <4 x i32> %a to <4 x float>
+ ret <4 x float> %b
+}
+
; CHECK-LABEL: @fptosi02
; CHECK: vcvttss2si {{.*}} encoding: [0x62
; CHECK: ret
diff --git a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
index e429a22..20bf7e4 100644
--- a/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
+++ b/test/CodeGen/X86/avx512-gather-scatter-intrin.ll
@@ -1,14 +1,14 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=knl | FileCheck %s
-declare <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float>, i16, <16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dps.mask.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double>, i8, <8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpd.mask.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
+declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dps.512 (i8*, i16, <16 x i32>, <16 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpd.512 (i8*, i8, <8 x i32>, <8 x double>, i32)
-declare <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qps.mask.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
+declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qps.512 (i8*, i8, <8 x i64>, <8 x float>, i32)
+declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, i8, <8 x i64>, <8 x double>, i32)
;CHECK-LABEL: gather_mask_dps
;CHECK: kmovw
@@ -17,9 +17,9 @@ declare void @llvm.x86.avx512.scatter.qpd.mask.512 (i8*, i8, <8 x i64>, <8 x dou
;CHECK: vscatterdps
;CHECK: ret
define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base, i8* %stbuf) {
- %x = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x float> %x, i32 4)
ret void
}
@@ -30,9 +30,9 @@ define void @gather_mask_dps(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8*
;CHECK: vscatterdpd
;CHECK: ret
define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x double> %x, i32 4)
ret void
}
@@ -43,9 +43,9 @@ define void @gather_mask_dpd(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %b
;CHECK: vscatterqps
;CHECK: ret
define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x float> %x, i32 4)
ret void
}
@@ -56,23 +56,23 @@ define void @gather_mask_qps(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %ba
;CHECK: vscatterqpd
;CHECK: ret
define void @gather_mask_qpd(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x double> %x, i32 4)
ret void
}
;;
;; Integer Gather/Scatter
;;
-declare <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32>, i16, <16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpi.mask.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64>, i8, <8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpq.mask.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
+declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, <16 x i32>, i16, i32)
+declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, i16, <16 x i32>, <16 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64>, i8*, <8 x i32>, i8, i32)
+declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, i8, <8 x i32>, <8 x i64>, i32)
-declare <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpi.mask.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64>, i8, <8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
+declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, i8, <8 x i64>, <8 x i32>, i32)
+declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, <8 x i64>, i8, i32)
+declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, i8, <8 x i64>, <8 x i64>, i32)
;CHECK-LABEL: gather_mask_dd
;CHECK: kmovw
@@ -81,9 +81,9 @@ declare void @llvm.x86.avx512.scatter.qpq.mask.512 (i8*, i8, <8 x i64>, <8 x i64
;CHECK: vpscatterdd
;CHECK: ret
define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %base, i8* %stbuf) {
- %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.mask.512 (<16 x i32> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
%ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpi.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind2, <16 x i32> %x, i32 4)
ret void
}
@@ -94,9 +94,9 @@ define void @gather_mask_dd(<16 x i32> %ind, <16 x i32> %src, i16 %mask, i8* %ba
;CHECK: vpscatterqd
;CHECK: ret
define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.mask.512 (<8 x i32> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i32> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpi.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i32> %x, i32 4)
ret void
}
@@ -107,9 +107,9 @@ define void @gather_mask_qd(<8 x i64> %ind, <8 x i32> %src, i8 %mask, i8* %base,
;CHECK: vpscatterqq
;CHECK: ret
define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind2, <8 x i64> %x, i32 4)
ret void
}
@@ -120,116 +120,19 @@ define void @gather_mask_qq(<8 x i64> %ind, <8 x i64> %src, i8 %mask, i8* %base,
;CHECK: vpscatterdq
;CHECK: ret
define void @gather_mask_dq(<8 x i32> %ind, <8 x i64> %src, i8 %mask, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.mask.512 (<8 x i64> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i64> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
%ind2 = add <8 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpq.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpq.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind2, <8 x i64> %x, i32 4)
ret void
}
-;; FP Intinsics without masks
-
-declare <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dps.512 (i8*, <16 x i32>, <16 x float>, i32)
-declare <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qps.512 (i8*, <8 x i64>, <8 x float>, i32)
-declare <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpd.512 (i8*, <8 x i64>, <8 x double>, i32)
-
-;CHECK-LABEL: gather_dps
-;CHECK: kxnorw
-;CHECK: vgatherdps
-;CHECK: vscatterdps
-;CHECK: ret
-define void @gather_dps(<16 x i32> %ind, i8* %base, i8* %stbuf) {
- %x = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x i32>%ind, i8* %base, i32 4)
- %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, <16 x i32>%ind2, <16 x float> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qps
-;CHECK: kxnorw
-;CHECK: vgatherqps
-;CHECK: vscatterqps
-;CHECK: ret
-define void @gather_qps(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, <8 x i64>%ind2, <8 x float> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpd
-;CHECK: kxnorw
-;CHECK: vgatherqpd
-;CHECK: vpadd
-;CHECK: vscatterqpd
-;CHECK: ret
-define void @gather_qpd(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, <8 x i64>%ind2, <8 x double> %x, i32 4)
- ret void
-}
-
-;; Integer Intinsics without masks
-
-declare <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpi.512 (i8*, <16 x i32>, <16 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.dpq.512 (<8 x i32>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.dpq.512 (i8*, <8 x i32>, <8 x i64>, i32)
-
-declare <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpi.512 (i8*, <8 x i64>, <8 x i32>, i32)
-declare <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>, i8*, i32)
-declare void @llvm.x86.avx512.scatter.qpq.512 (i8*, <8 x i64>, <8 x i64>, i32)
-
-;CHECK-LABEL: gather_dpi
-;CHECK: kxnorw
-;CHECK: vpgatherdd
-;CHECK: vpscatterdd
-;CHECK: ret
-define void @gather_dpi(<16 x i32> %ind, i8* %base, i8* %stbuf) {
- %x = call <16 x i32> @llvm.x86.avx512.gather.dpi.512 (<16 x i32>%ind, i8* %base, i32 4)
- %ind2 = add <16 x i32> %ind, <i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3, i32 0, i32 1, i32 2, i32 3>
- call void @llvm.x86.avx512.scatter.dpi.512 (i8* %stbuf, <16 x i32>%ind2, <16 x i32> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpq
-;CHECK: vpxord %zmm
-;CHECK: kxnorw
-;CHECK: vpgatherqq
-;CHECK: vpadd
-;CHECK: vpscatterqq
-;CHECK: ret
-define void @gather_qpq(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x i64> @llvm.x86.avx512.gather.qpq.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpq.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i64> %x, i32 4)
- ret void
-}
-
-;CHECK-LABEL: gather_qpi
-;CHECK: vpxor %ymm
-;CHECK: kxnorw
-;CHECK: vpgatherqd
-;CHECK: vpadd
-;CHECK: vpscatterqd
-;CHECK: ret
-define void @gather_qpi(<8 x i64> %ind, i8* %base, i8* %stbuf) {
- %x = call <8 x i32> @llvm.x86.avx512.gather.qpi.512 (<8 x i64>%ind, i8* %base, i32 4)
- %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
- call void @llvm.x86.avx512.scatter.qpi.512 (i8* %stbuf, <8 x i64>%ind2, <8 x i32> %x, i32 4)
- ret void
-}
;CHECK-LABEL: gather_mask_dpd_execdomain
;CHECK: vgatherdpd
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.dpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i32>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.dpd.512 (<8 x double> %src, i8* %base, <8 x i32>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@@ -239,7 +142,7 @@ define void @gather_mask_dpd_execdomain(<8 x i32> %ind, <8 x double> %src, i8 %m
;CHECK: vmovapd
;CHECK: ret
define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %mask, i8* %base, <8 x double>* %stbuf) {
- %x = call <8 x double> @llvm.x86.avx512.gather.qpd.mask.512 (<8 x double> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %x = call <8 x double> @llvm.x86.avx512.gather.qpd.512 (<8 x double> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
store <8 x double> %x, <8 x double>* %stbuf
ret void
}
@@ -249,7 +152,7 @@ define void @gather_mask_qpd_execdomain(<8 x i64> %ind, <8 x double> %src, i8 %m
;CHECK: vmovaps
;CHECK: ret
define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %src, i16 %mask, i8* %base) {
- %res = call <16 x float> @llvm.x86.avx512.gather.dps.mask.512 (<16 x float> %src, i16 %mask, <16 x i32>%ind, i8* %base, i32 4)
+ %res = call <16 x float> @llvm.x86.avx512.gather.dps.512 (<16 x float> %src, i8* %base, <16 x i32>%ind, i16 %mask, i32 4)
ret <16 x float> %res;
}
@@ -258,7 +161,7 @@ define <16 x float> @gather_mask_dps_execdomain(<16 x i32> %ind, <16 x float> %s
;CHECK: vmovaps
;CHECK: ret
define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src, i8 %mask, i8* %base) {
- %res = call <8 x float> @llvm.x86.avx512.gather.qps.mask.512 (<8 x float> %src, i8 %mask, <8 x i64>%ind, i8* %base, i32 4)
+ %res = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 %mask, i32 4)
ret <8 x float> %res;
}
@@ -268,7 +171,7 @@ define <8 x float> @gather_mask_qps_execdomain(<8 x i64> %ind, <8 x float> %src,
;CHECK: ret
define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
- call void @llvm.x86.avx512.scatter.dpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dpd.512 (i8* %stbuf, i8 %mask, <8 x i32>%ind, <8 x double> %x, i32 4)
ret void
}
@@ -278,7 +181,7 @@ define void @scatter_mask_dpd_execdomain(<8 x i32> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x double>* %src, align 64
- call void @llvm.x86.avx512.scatter.qpd.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qpd.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x double> %x, i32 4)
ret void
}
@@ -288,7 +191,7 @@ define void @scatter_mask_qpd_execdomain(<8 x i64> %ind, <8 x double>* %src, i8
;CHECK: ret
define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i16 %mask, i8* %base, i8* %stbuf) {
%x = load <16 x float>* %src, align 64
- call void @llvm.x86.avx512.scatter.dps.mask.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.dps.512 (i8* %stbuf, i16 %mask, <16 x i32>%ind, <16 x float> %x, i32 4)
ret void
}
@@ -298,6 +201,35 @@ define void @scatter_mask_dps_execdomain(<16 x i32> %ind, <16 x float>* %src, i1
;CHECK: ret
define void @scatter_mask_qps_execdomain(<8 x i64> %ind, <8 x float>* %src, i8 %mask, i8* %base, i8* %stbuf) {
%x = load <8 x float>* %src, align 32
- call void @llvm.x86.avx512.scatter.qps.mask.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 %mask, <8 x i64>%ind, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: gather_qps
+;CHECK: kxnorw
+;CHECK: vgatherqps
+;CHECK: vpadd
+;CHECK: vscatterqps
+;CHECK: ret
+define void @gather_qps(<8 x i64> %ind, <8 x float> %src, i8* %base, i8* %stbuf) {
+ %x = call <8 x float> @llvm.x86.avx512.gather.qps.512 (<8 x float> %src, i8* %base, <8 x i64>%ind, i8 -1, i32 4)
+ %ind2 = add <8 x i64> %ind, <i64 0, i64 1, i64 2, i64 3, i64 0, i64 1, i64 2, i64 3>
+ call void @llvm.x86.avx512.scatter.qps.512 (i8* %stbuf, i8 -1, <8 x i64>%ind2, <8 x float> %x, i32 4)
+ ret void
+}
+
+;CHECK-LABEL: prefetch
+;CHECK: gatherpf0
+;CHECK: gatherpf1
+;CHECK: scatterpf0
+;CHECK: scatterpf1
+;CHECK: ret
+declare void @llvm.x86.avx512.gatherpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+declare void @llvm.x86.avx512.scatterpf.qps.512(i8, <8 x i64>, i8* , i32, i32);
+define void @prefetch(<8 x i64> %ind, i8* %base) {
+ call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 0)
+ call void @llvm.x86.avx512.gatherpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 4, i32 1)
+ call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 0)
+ call void @llvm.x86.avx512.scatterpf.qps.512(i8 -1, <8 x i64> %ind, i8* %base, i32 2, i32 1)
ret void
}
diff --git a/test/CodeGen/X86/avx512-insert-extract.ll b/test/CodeGen/X86/avx512-insert-extract.ll
index 6557ac3..b360c71 100644
--- a/test/CodeGen/X86/avx512-insert-extract.ll
+++ b/test/CodeGen/X86/avx512-insert-extract.ll
@@ -158,3 +158,41 @@ define i64 @test14(<8 x i64>%a, <8 x i64>%b, i64 %a1, i64 %b1) {
%res = select i1 %extract24vector_func.i, i64 %a1, i64 %b1
ret i64 %res
}
+
+;CHECK-LABEL: test15
+;CHECK: kshiftlw
+;CHECK: kmovw
+;CHECK: ret
+define i16 @test15(i1 *%addr) {
+ %x = load i1 * %addr, align 128
+ %x1 = insertelement <16 x i1> undef, i1 %x, i32 10
+ %x2 = bitcast <16 x i1>%x1 to i16
+ ret i16 %x2
+}
+
+;CHECK-LABEL: test16
+;CHECK: kshiftlw
+;CHECK: kshiftrw
+;CHECK: korw
+;CHECK: ret
+define i16 @test16(i1 *%addr, i16 %a) {
+ %x = load i1 * %addr, align 128
+ %a1 = bitcast i16 %a to <16 x i1>
+ %x1 = insertelement <16 x i1> %a1, i1 %x, i32 10
+ %x2 = bitcast <16 x i1>%x1 to i16
+ ret i16 %x2
+}
+
+;CHECK-LABEL: test17
+;CHECK: kshiftlw
+;CHECK: kshiftrw
+;CHECK: korw
+;CHECK: ret
+define i8 @test17(i1 *%addr, i8 %a) {
+ %x = load i1 * %addr, align 128
+ %a1 = bitcast i8 %a to <8 x i1>
+ %x1 = insertelement <8 x i1> %a1, i1 %x, i32 10
+ %x2 = bitcast <8 x i1>%x1 to i8
+ ret i8 %x2
+}
+
diff --git a/test/CodeGen/X86/avx512-intrinsics.ll b/test/CodeGen/X86/avx512-intrinsics.ll
index 3fb38ed..e19841a 100644
--- a/test/CodeGen/X86/avx512-intrinsics.ll
+++ b/test/CodeGen/X86/avx512-intrinsics.ll
@@ -78,7 +78,7 @@ declare <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double>, i32, <8
define <8 x double> @test7(<8 x double> %a) {
; CHECK: vrndscalepd {{.*}}encoding: [0x62,0xf3,0xfd,0x48,0x09,0xc0,0x0b]
- %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> zeroinitializer, i8 -1, i32 4)
+ %res = call <8 x double> @llvm.x86.avx512.mask.rndscale.pd.512(<8 x double> %a, i32 11, <8 x double> %a, i8 -1, i32 4)
ret <8 x double>%res
}
@@ -86,7 +86,7 @@ declare <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float>, i32, <1
define <16 x float> @test8(<16 x float> %a) {
; CHECK: vrndscaleps {{.*}}encoding: [0x62,0xf3,0x7d,0x48,0x08,0xc0,0x0b]
- %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> zeroinitializer, i16 -1, i32 4)
+ %res = call <16 x float> @llvm.x86.avx512.mask.rndscale.ps.512(<16 x float> %a, i32 11, <16 x float> %a, i16 -1, i32 4)
ret <16 x float>%res
}
@@ -536,4 +536,12 @@ define void @test_store2(<8 x double> %data, i8* %ptr, i8 %mask) {
ret void
}
-declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 ) \ No newline at end of file
+declare void @llvm.x86.avx512.mask.storeu.pd.512(i8*, <8 x double>, i8 )
+
+define <16 x float> @test_vpermt2ps(<16 x float>%x, <16 x float>%y, <16 x i32>%perm) {
+; CHECK: vpermt2ps {{.*}}encoding: [0x62,0xf2,0x6d,0x48,0x7f,0xc1]
+ %res = call <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>%perm, <16 x float>%x, <16 x float>%y, i16 -1)
+ ret <16 x float> %res
+}
+
+declare <16 x float> @llvm.x86.avx512.mask.vpermt.ps.512(<16 x i32>, <16 x float>, <16 x float>, i16)
diff --git a/test/CodeGen/X86/avx512-mov.ll b/test/CodeGen/X86/avx512-mov.ll
index 13e6843..009802f 100644
--- a/test/CodeGen/X86/avx512-mov.ll
+++ b/test/CodeGen/X86/avx512-mov.ll
@@ -153,3 +153,31 @@ define void @test18(i8 * %addr, <8 x i64> %data) {
ret void
}
+; CHECK-LABEL: store_i1_1
+; CHECK: movb
+; CHECK: movb
+; CHECK: ret
+define void @store_i1_1() {
+ store i1 true, i1 addrspace(3)* undef, align 128
+ store i1 false, i1 addrspace(2)* undef, align 128
+ ret void
+}
+
+; CHECK-LABEL: store_i1_2
+; CHECK: movb
+; CHECK: ret
+define void @store_i1_2(i64 %a, i64 %b) {
+ %res = icmp eq i64 %a, %b
+ store i1 %res, i1 addrspace(3)* undef, align 128
+ ret void
+}
+
+; CHECK-LABEL: store_i1_3
+; CHECK: kmovw
+; CHECK: ret
+define void @store_i1_3(i16 %a) {
+ %a_vec = bitcast i16 %a to <16 x i1>
+ %res = extractelement <16 x i1> %a_vec, i32 4
+ store i1 %res, i1 addrspace(3)* undef, align 128
+ ret void
+}
diff --git a/test/CodeGen/X86/avx512-shuffle.ll b/test/CodeGen/X86/avx512-shuffle.ll
index 59d7010..23ddc3a 100644
--- a/test/CodeGen/X86/avx512-shuffle.ll
+++ b/test/CodeGen/X86/avx512-shuffle.ll
@@ -231,3 +231,22 @@ define <16 x i32> @test27(<4 x i32>%a) {
%res = shufflevector <4 x i32> %a, <4 x i32> undef, <16 x i32> <i32 0, i32 1, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef, i32 undef>
ret <16 x i32> %res
}
+
+; CHECK-LABEL: @test28
+; CHECK: vinserti64x4 $1
+; CHECK: ret
+define <16 x i32> @test28(<16 x i32>%x, <16 x i32>%y) {
+ %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7,
+ i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ ret <16 x i32> %res
+}
+
+; CHECK-LABEL: @test29
+; CHECK: vinserti64x4 $0
+; CHECK: ret
+define <16 x i32> @test29(<16 x i32>%x, <16 x i32>%y) {
+ %res = shufflevector <16 x i32>%x, <16 x i32>%y, <16 x i32> <i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23,
+ i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15>
+ ret <16 x i32> %res
+}
+
diff --git a/test/CodeGen/X86/blend-msb.ll b/test/CodeGen/X86/blend-msb.ll
index 6b46596..34aaf2c 100644
--- a/test/CodeGen/X86/blend-msb.ll
+++ b/test/CodeGen/X86/blend-msb.ll
@@ -4,7 +4,7 @@
; Verify that we produce movss instead of blendvps when possible.
;CHECK-LABEL: vsel_float:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
;CHECK: movss
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
@@ -13,7 +13,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
}
;CHECK-LABEL: vsel_4xi8:
-;CHECK-NOT: blendvps
+;CHECK-NOT: blend
;CHECK: movss
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
@@ -21,14 +21,18 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
ret <4 x i8> %vsel
}
-
-; We do not have native support for v8i16 blends and we have to use the
-; blendvb instruction or a sequence of NAND/OR/AND. Make sure that we do not
-; reduce the mask in this case.
;CHECK-LABEL: vsel_8xi16:
-;CHECK: andps
-;CHECK: andps
-;CHECK: orps
+; The select mask is
+; <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>
+; which translates into the boolean mask (big endian representation):
+; 00010001 = 17.
+; '1' means takes the first argument, '0' means takes the second argument.
+; This is the opposite of the intel syntax, thus we expect
+; the inverted mask: 11101110 = 238.
+; According to the ABI:
+; v1 is in xmm0 => first argument is xmm0.
+; v2 is in xmm1 => second argument is xmm1.
+;CHECK: pblendw $238, %xmm1, %xmm0
;CHECK: ret
define <8 x i16> @vsel_8xi16(<8 x i16> %v1, <8 x i16> %v2) {
%vsel = select <8 x i1> <i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 false, i1 false>, <8 x i16> %v1, <8 x i16> %v2
diff --git a/test/CodeGen/X86/bmi.ll b/test/CodeGen/X86/bmi.ll
index 242075a..a707209 100644
--- a/test/CodeGen/X86/bmi.ll
+++ b/test/CodeGen/X86/bmi.ll
@@ -216,6 +216,23 @@ entry:
; CHECK: bzhiq
}
+define i64 @bzhi64_constant_mask(i64 %x) #0 {
+entry:
+ %and = and i64 %x, 4611686018427387903
+ ret i64 %and
+; CHECK-LABEL: bzhi64_constant_mask:
+; CHECK: movb $62, %al
+; CHECK: bzhiq %rax, %r[[ARG1:di|cx]], %rax
+}
+
+define i64 @bzhi64_small_constant_mask(i64 %x) #0 {
+entry:
+ %and = and i64 %x, 2147483647
+ ret i64 %and
+; CHECK-LABEL: bzhi64_small_constant_mask:
+; CHECK: andq $2147483647, %r[[ARG1]]
+}
+
define i32 @blsi32(i32 %x) nounwind readnone {
%tmp = sub i32 0, %x
%tmp2 = and i32 %x, %tmp
diff --git a/test/CodeGen/X86/br-fold.ll b/test/CodeGen/X86/br-fold.ll
index 5223463..fd1e73b 100644
--- a/test/CodeGen/X86/br-fold.ll
+++ b/test/CodeGen/X86/br-fold.ll
@@ -1,7 +1,19 @@
-; RUN: llc -march=x86-64 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-apple-darwin < %s | FileCheck -check-prefix=X64_DARWIN %s
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck -check-prefix=X64_LINUX %s
+; RUN: llc -mtriple=x86_64-pc-windows < %s | FileCheck -check-prefix=X64_WINDOWS %s
+; RUN: llc -mtriple=x86_64-pc-windows-gnu < %s | FileCheck -check-prefix=X64_WINDOWS_GNU %s
-; CHECK: orq
-; CHECK-NEXT: %bb8.i329
+; X64_DARWIN: orq
+; X64_DARWIN-NEXT: %bb8.i329
+
+; X64_LINUX: orq %rax, %rcx
+; X64_LINUX-NEXT: %bb8.i329
+
+; X64_WINDOWS: orq %rax, %rcx
+; X64_WINDOWS-NEXT: ud2
+
+; X64_WINDOWS_GNU: orq %rax, %rcx
+; X64_WINDOWS_GNU-NEXT: ud2
@_ZN11xercesc_2_513SchemaSymbols21fgURI_SCHEMAFORSCHEMAE = external constant [33 x i16], align 32 ; <[33 x i16]*> [#uses=1]
@_ZN11xercesc_2_56XMLUni16fgNotationStringE = external constant [9 x i16], align 16 ; <[9 x i16]*> [#uses=1]
diff --git a/test/CodeGen/X86/bswap-vector.ll b/test/CodeGen/X86/bswap-vector.ll
index 6b77176..3c931db 100644
--- a/test/CodeGen/X86/bswap-vector.ll
+++ b/test/CodeGen/X86/bswap-vector.ll
@@ -1,19 +1,144 @@
-; RUN: llc < %s -mcpu=x86_64 | FileCheck %s
+; RUN: llc < %s -mcpu=x86-64 | FileCheck %s -check-prefix=CHECK-NOSSSE3
+; RUN: llc < %s -mcpu=core2 | FileCheck %s -check-prefix=CHECK-SSSE3
+; RUN: llc < %s -mcpu=core-avx2 | FileCheck %s -check-prefix=CHECK-AVX2
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
target triple = "x86_64-unknown-linux-gnu"
+declare <8 x i16> @llvm.bswap.v8i16(<8 x i16>)
+declare <4 x i32> @llvm.bswap.v4i32(<4 x i32>)
declare <2 x i64> @llvm.bswap.v2i64(<2 x i64>)
-define <2 x i64> @foo(<2 x i64> %v) #0 {
+define <8 x i16> @test1(<8 x i16> %v) #0 {
+entry:
+ %r = call <8 x i16> @llvm.bswap.v8i16(<8 x i16> %v)
+ ret <8 x i16> %r
+
+; CHECK-NOSSSE3-LABEL: @test1
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: rolw
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test1
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test1
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i32> @test2(<4 x i32> %v) #0 {
+entry:
+ %r = call <4 x i32> @llvm.bswap.v4i32(<4 x i32> %v)
+ ret <4 x i32> %r
+
+; CHECK-NOSSSE3-LABEL: @test2
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: bswapl
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test2
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test2
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <2 x i64> @test3(<2 x i64> %v) #0 {
entry:
%r = call <2 x i64> @llvm.bswap.v2i64(<2 x i64> %v)
ret <2 x i64> %r
+
+; CHECK-NOSSSE3-LABEL: @test3
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: bswapq
+; CHECK-NOSSSE3: retq
+
+; CHECK-SSSE3-LABEL: @test3
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test3
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+declare <16 x i16> @llvm.bswap.v16i16(<16 x i16>)
+declare <8 x i32> @llvm.bswap.v8i32(<8 x i32>)
+declare <4 x i64> @llvm.bswap.v4i64(<4 x i64>)
+
+define <16 x i16> @test4(<16 x i16> %v) #0 {
+entry:
+ %r = call <16 x i16> @llvm.bswap.v16i16(<16 x i16> %v)
+ ret <16 x i16> %r
+
+; CHECK-SSSE3-LABEL: @test4
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test4
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <8 x i32> @test5(<8 x i32> %v) #0 {
+entry:
+ %r = call <8 x i32> @llvm.bswap.v8i32(<8 x i32> %v)
+ ret <8 x i32> %r
+
+; CHECK-SSSE3-LABEL: @test5
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test5
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
+}
+
+define <4 x i64> @test6(<4 x i64> %v) #0 {
+entry:
+ %r = call <4 x i64> @llvm.bswap.v4i64(<4 x i64> %v)
+ ret <4 x i64> %r
+
+; CHECK-SSSE3-LABEL: @test6
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test6
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2-NEXT: retq
}
-; CHECK-LABEL: @foo
-; CHECK: bswapq
-; CHECK: bswapq
-; CHECK: retq
+declare <4 x i16> @llvm.bswap.v4i16(<4 x i16>)
+
+define <4 x i16> @test7(<4 x i16> %v) #0 {
+entry:
+ %r = call <4 x i16> @llvm.bswap.v4i16(<4 x i16> %v)
+ ret <4 x i16> %r
+
+; CHECK-SSSE3-LABEL: @test7
+; CHECK-SSSE3: pshufb
+; CHECK-SSSE3: psrld $16
+; CHECK-SSSE3-NEXT: retq
+
+; CHECK-AVX2-LABEL: @test7
+; CHECK-AVX2: vpshufb
+; CHECK-AVX2: vpsrld $16
+; CHECK-AVX2-NEXT: retq
+}
attributes #0 = { nounwind uwtable }
diff --git a/test/CodeGen/X86/cdecl-method-return.ll b/test/CodeGen/X86/cdecl-method-return.ll
deleted file mode 100644
index 2baa47a..0000000
--- a/test/CodeGen/X86/cdecl-method-return.ll
+++ /dev/null
@@ -1,69 +0,0 @@
-; RUN: llc < %s -mtriple=i686-pc-win32 -mcpu=core2 | FileCheck %s
-
-; The sret flag causes the first two parameters to be reordered on the stack.
-
-define x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src) {
- %v = load i32* %src
- store i32 %v, i32* %dst
- ret void
-}
-
-; CHECK-LABEL: _foo:
-; CHECK: movl 8(%esp), %[[dst:[^ ]*]]
-; CHECK: movl 4(%esp), %[[src:[^ ]*]]
-; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
-; CHECK: movl %[[v]], (%[[dst]])
-; CHECK: retl
-
-define i32 @bar() {
- %src = alloca i32
- %dst = alloca i32
- store i32 42, i32* %src
- call x86_cdeclmethodcc void @foo(i32* sret %dst, i32* %src)
- %v = load i32* %dst
- ret i32 %v
-}
-
-; CHECK-LABEL: _bar:
-; CHECK: movl $42, [[src:[^,]*]]
-; CHECK: leal [[src]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], (%esp)
-; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], 4(%esp)
-; CHECK: calll _foo
-; CHECK: movl [[dst]], %eax
-; CHECK: retl
-
-; If we don't have the sret flag, parameters are not reordered.
-
-define x86_cdeclmethodcc void @baz(i32* %dst, i32* %src) {
- %v = load i32* %src
- store i32 %v, i32* %dst
- ret void
-}
-
-; CHECK-LABEL: _baz:
-; CHECK: movl 4(%esp), %[[dst:[^ ]*]]
-; CHECK: movl 8(%esp), %[[src:[^ ]*]]
-; CHECK: movl (%[[src]]), %[[v:[^ ]*]]
-; CHECK: movl %[[v]], (%[[dst]])
-; CHECK: retl
-
-define i32 @qux() {
- %src = alloca i32
- %dst = alloca i32
- store i32 42, i32* %src
- call x86_cdeclmethodcc void @baz(i32* %dst, i32* %src)
- %v = load i32* %dst
- ret i32 %v
-}
-
-; CHECK-LABEL: _qux:
-; CHECK: movl $42, [[src:[^,]*]]
-; CHECK: leal [[src]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], 4(%esp)
-; CHECK: leal [[dst:[^,]*]], %[[reg:[^ ]*]]
-; CHECK: movl %[[reg]], (%esp)
-; CHECK: calll _baz
-; CHECK: movl [[dst]], %eax
-; CHECK: retl
diff --git a/test/CodeGen/X86/cfi.ll b/test/CodeGen/X86/cfi.ll
new file mode 100644
index 0000000..b57ff45
--- /dev/null
+++ b/test/CodeGen/X86/cfi.ll
@@ -0,0 +1,27 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck --check-prefix=STATIC %s
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -relocation-model=pic | FileCheck --check-prefix=PIC %s
+
+; STATIC: .cfi_personality 3, __gxx_personality_v0
+; STATIC: .cfi_lsda 3, .Lexception0
+
+; PIC: .cfi_personality 155, DW.ref.__gxx_personality_v0
+; PIC: .cfi_lsda 27, .Lexception0
+
+
+define void @bar() {
+entry:
+ %call = invoke i32 @foo()
+ to label %invoke.cont unwind label %lpad
+
+invoke.cont:
+ ret void
+
+lpad:
+ %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
+ catch i8* null
+ ret void
+}
+
+declare i32 @foo()
+
+declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/cmp.ll b/test/CodeGen/X86/cmp.ll
index 551d9bc..cdcdc96 100644
--- a/test/CodeGen/X86/cmp.ll
+++ b/test/CodeGen/X86/cmp.ll
@@ -26,9 +26,22 @@ cond_true: ; preds = %0
ReturnBlock: ; preds = %0
ret i32 0
; CHECK-LABEL: test2:
-; CHECK: movl (%rsi), %eax
-; CHECK: shll $3, %eax
-; CHECK: testl %eax, %eax
+; CHECK: testl $536870911, (%rsi)
+}
+
+define i8 @test2b(i8 %X, i8* %y) nounwind {
+ %tmp = load i8* %y ; <i8> [#uses=1]
+ %tmp1 = shl i8 %tmp, 3 ; <i8> [#uses=1]
+ %tmp1.upgrd.2 = icmp eq i8 %tmp1, 0 ; <i1> [#uses=1]
+ br i1 %tmp1.upgrd.2, label %ReturnBlock, label %cond_true
+
+cond_true: ; preds = %0
+ ret i8 1
+
+ReturnBlock: ; preds = %0
+ ret i8 0
+; CHECK-LABEL: test2b:
+; CHECK: testb $31, (%rsi)
}
define i64 @test3(i64 %x) nounwind {
@@ -68,8 +81,8 @@ define i32 @test5(double %A) nounwind {
bb12:; preds = %entry
ret i32 32
; CHECK-LABEL: test5:
-; CHECK: ucomisd LCPI4_0(%rip), %xmm0
-; CHECK: ucomisd LCPI4_1(%rip), %xmm0
+; CHECK: ucomisd LCPI5_0(%rip), %xmm0
+; CHECK: ucomisd LCPI5_1(%rip), %xmm0
}
declare i32 @foo(...)
@@ -163,3 +176,25 @@ define i32 @test12() uwtable ssp {
}
declare zeroext i1 @test12b()
+
+define i32 @test13(i32 %mask, i32 %base, i32 %intra) {
+ %and = and i32 %mask, 8
+ %tobool = icmp ne i32 %and, 0
+ %cond = select i1 %tobool, i32 %intra, i32 %base
+ ret i32 %cond
+
+; CHECK-LABEL: test13:
+; CHECK: testb $8, %dil
+; CHECK: cmovnel
+}
+
+define i32 @test14(i32 %mask, i32 %base, i32 %intra) #0 {
+ %s = lshr i32 %mask, 7
+ %tobool = icmp sgt i32 %s, -1
+ %cond = select i1 %tobool, i32 %intra, i32 %base
+ ret i32 %cond
+
+; CHECK-LABEL: test14:
+; CHECK: shrl $7, %edi
+; CHECK-NEXT: cmovnsl %edx, %esi
+}
diff --git a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
index e3d6b34..78e1dd2 100644
--- a/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
+++ b/test/CodeGen/X86/codegen-prepare-addrmode-sext.ll
@@ -1,4 +1,5 @@
; RUN: opt -S -codegenprepare %s -o - | FileCheck %s
+; RUN: opt -S -codegenprepare -addr-sink-using-gep=1 %s -o - | FileCheck -check-prefix=CHECK-GEP %s
; This file tests the different cases what are involved when codegen prepare
; tries to get sign extension out of the way of addressing mode.
; This tests require an actual target as addressing mode decisions depends
@@ -281,6 +282,25 @@ define i8 @twoArgsNoPromotionRemove(i1 %arg1, i8 %arg2, i8* %base) {
; CHECK: [[ADDR2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[BASE2]] to i32*
; CHECK: load i32* [[ADDR2]]
; CHECK: ret
+; CHECK-GEP-LABEL: @checkProfitability
+; CHECK-GEP-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg1 to i64
+; CHECK-GEP-NOT: {{%[a-zA-Z_0-9-]+}} = sext i32 %arg2 to i64
+; CHECK-GEP: [[SHL:%[a-zA-Z_0-9-]+]] = shl nsw i32 %arg1, 1
+; CHECK-GEP: [[ADD:%[a-zA-Z_0-9-]+]] = add nsw i32 [[SHL]], %arg2
+; CHECK-GEP: [[SEXTADD:%[a-zA-Z_0-9-]+]] = sext i32 [[ADD]] to i64
+; BB then
+; CHECK-GEP: [[BASE1:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
+; CHECK-GEP: [[BCC1:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE1]] to i8*
+; CHECK-GEP: [[FULL1:%[a-zA-Z_0-9-]+]] = getelementptr i8* [[BCC1]], i64 48
+; CHECK-GEP: [[ADDR1:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL1]] to i32*
+; CHECK-GEP: load i32* [[ADDR1]]
+; BB else
+; CHECK-GEP: [[BASE2:%[a-zA-Z_0-9-]+]] = inttoptr i64 [[SEXTADD]] to i32*
+; CHECK-GEP: [[BCC2:%[a-zA-Z_0-9-]+]] = bitcast i32* [[BASE2]] to i8*
+; CHECK-GEP: [[FULL2:%[a-zA-Z_0-9-]+]] = getelementptr i8* [[BCC2]], i64 48
+; CHECK-GEP: [[ADDR2:%[a-zA-Z_0-9-]+]] = bitcast i8* [[FULL2]] to i32*
+; CHECK-GEP: load i32* [[ADDR2]]
+; CHECK-GEP: ret
define i32 @checkProfitability(i32 %arg1, i32 %arg2, i1 %test) {
%shl = shl nsw i32 %arg1, 1
%add1 = add nsw i32 %shl, %arg2
diff --git a/test/CodeGen/X86/codegen-prepare-crash.ll b/test/CodeGen/X86/codegen-prepare-crash.ll
new file mode 100644
index 0000000..c328817
--- /dev/null
+++ b/test/CodeGen/X86/codegen-prepare-crash.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s
+target triple = "x86_64-unknown-linux-gnu"
+
+@g = external global [10 x i32]
+
+define void @f(i32 %u) {
+ %1 = add i32 %u, 4
+ br label %P.Proc8.exit
+
+P.Proc8.exit:
+ %valueindex35.i = getelementptr [10 x i32]* @g, i32 0, i32 %1
+ store i32 %u, i32* %valueindex35.i
+ ret void
+}
diff --git a/test/CodeGen/X86/codegen-prepare.ll b/test/CodeGen/X86/codegen-prepare.ll
index 316accf..4ff0f1c 100644
--- a/test/CodeGen/X86/codegen-prepare.ll
+++ b/test/CodeGen/X86/codegen-prepare.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-pc-linux | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -addr-sink-using-gep=1 | FileCheck %s
; Check that the CodeGenPrepare Pass
; does not wrongly rewrite the address computed by Instruction %4
diff --git a/test/CodeGen/X86/combine-avx-intrinsics.ll b/test/CodeGen/X86/combine-avx-intrinsics.ll
new file mode 100644
index 0000000..f610f7f
--- /dev/null
+++ b/test/CodeGen/X86/combine-avx-intrinsics.ll
@@ -0,0 +1,119 @@
+; RUN: llc < %s -march=x86-64 -mcpu=corei7-avx | FileCheck %s
+
+
+define <4 x double> @test_x86_avx_blend_pd_256(<4 x double> %a0) {
+ %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a0, i32 7)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test_x86_avx_blend_pd_256
+; CHECK-NOT: vblendpd
+; CHECK: ret
+
+
+define <8 x float> @test_x86_avx_blend_ps_256(<8 x float> %a0) {
+ %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a0, i32 7)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test_x86_avx_blend_ps_256
+; CHECK-NOT: vblendps
+; CHECK: ret
+
+
+define <4 x double> @test_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a0, <4 x double> %a1)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test_x86_avx_blendv_pd_256
+; CHECK-NOT: vblendvpd
+; CHECK: ret
+
+
+define <8 x float> @test_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a0, <8 x float> %a1)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test_x86_avx_blendv_ps_256
+; CHECK-NOT: vblendvps
+; CHECK: ret
+
+
+define <4 x double> @test2_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 0)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test2_x86_avx_blend_pd_256
+; CHECK-NOT: vblendpd
+; CHECK: ret
+
+
+define <8 x float> @test2_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 0)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test2_x86_avx_blend_ps_256
+; CHECK-NOT: vblendps
+; CHECK: ret
+
+
+define <4 x double> @test2_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> zeroinitializer)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test2_x86_avx_blendv_pd_256
+; CHECK-NOT: vblendvpd
+; CHECK: ret
+
+
+define <8 x float> @test2_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> zeroinitializer)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test2_x86_avx_blendv_ps_256
+; CHECK-NOT: vblendvps
+; CHECK: ret
+
+
+define <4 x double> @test3_x86_avx_blend_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %1 = call <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double> %a0, <4 x double> %a1, i32 -1)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test3_x86_avx_blend_pd_256
+; CHECK-NOT: vblendpd
+; CHECK: ret
+
+
+define <8 x float> @test3_x86_avx_blend_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %1 = call <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float> %a0, <8 x float> %a1, i32 -1)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test3_x86_avx_blend_ps_256
+; CHECK-NOT: vblendps
+; CHECK: ret
+
+
+define <4 x double> @test3_x86_avx_blendv_pd_256(<4 x double> %a0, <4 x double> %a1) {
+ %Mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <4 x double>
+ %1 = call <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double> %a0, <4 x double> %a1, <4 x double> %Mask)
+ ret <4 x double> %1
+}
+; CHECK-LABEL: test3_x86_avx_blendv_pd_256
+; CHECK-NOT: vblendvpd
+; CHECK: ret
+
+
+define <8 x float> @test3_x86_avx_blendv_ps_256(<8 x float> %a0, <8 x float> %a1) {
+ %Mask = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <8 x float>
+ %1 = call <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float> %a0, <8 x float> %a1, <8 x float> %Mask)
+ ret <8 x float> %1
+}
+; CHECK-LABEL: test3_x86_avx_blendv_ps_256
+; CHECK-NOT: vblendvps
+; CHECK: ret
+
+
+
+declare <4 x double> @llvm.x86.avx.blend.pd.256(<4 x double>, <4 x double>, i32)
+declare <8 x float> @llvm.x86.avx.blend.ps.256(<8 x float>, <8 x float>, i32)
+declare <4 x double> @llvm.x86.avx.blendv.pd.256(<4 x double>, <4 x double>, <4 x double>)
+declare <8 x float> @llvm.x86.avx.blendv.ps.256(<8 x float>, <8 x float>, <8 x float>)
+
diff --git a/test/CodeGen/X86/combine-avx2-intrinsics.ll b/test/CodeGen/X86/combine-avx2-intrinsics.ll
new file mode 100644
index 0000000..8794f8b
--- /dev/null
+++ b/test/CodeGen/X86/combine-avx2-intrinsics.ll
@@ -0,0 +1,164 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core-avx2 | FileCheck %s
+
+; Verify that the backend correctly combines AVX2 builtin intrinsics.
+
+
+define <8 x i32> @test_psra_1(<8 x i32> %A) {
+ %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 3)
+ %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
+ %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 2)
+ ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: vpsrad $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
+
+define <16 x i16> @test_psra_2(<16 x i16> %A) {
+ %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 3)
+ %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 2)
+ ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: vpsraw $8, %ymm0, %ymm0
+; CHECK-NEXT: ret
+
+define <16 x i16> @test_psra_3(<16 x i16> %A) {
+ %1 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %A, i32 0)
+ %2 = tail call <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16> %2, i32 0)
+ ret <16 x i16> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: vpsraw
+; CHECK: ret
+
+define <8 x i32> @test_psra_4(<8 x i32> %A) {
+ %1 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %A, i32 0)
+ %2 = tail call <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
+ %3 = tail call <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32> %2, i32 0)
+ ret <8 x i32> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: vpsrad
+; CHECK: ret
+
+
+define <32 x i8> @test_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
+ %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a0, <32 x i8> %a1)
+ ret <32 x i8> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendvb
+; CHECK-NOT: vpblendvb
+; CHECK: ret
+
+
+define <16 x i16> @test_x86_avx2_pblendw(<16 x i16> %a0) {
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a0, i32 7)
+ ret <16 x i16> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendw
+; CHECK-NOT: vpblendw
+; CHECK: ret
+
+
+define <4 x i32> @test_x86_avx2_pblendd_128(<4 x i32> %a0) {
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a0, i32 7)
+ ret <4 x i32> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendd_128
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <8 x i32> @test_x86_avx2_pblendd_256(<8 x i32> %a0) {
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a0, i32 7)
+ ret <8 x i32> %res
+}
+; CHECK-LABEL: test_x86_avx2_pblendd_256
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <32 x i8> @test2_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
+ %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> zeroinitializer)
+ ret <32 x i8> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendvb
+; CHECK-NOT: vpblendvb
+; CHECK: ret
+
+
+define <16 x i16> @test2_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 0)
+ ret <16 x i16> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendw
+; CHECK-NOT: vpblendw
+; CHECK: ret
+
+
+define <4 x i32> @test2_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 0)
+ ret <4 x i32> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendd_128
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <8 x i32> @test2_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 0)
+ ret <8 x i32> %res
+}
+; CHECK-LABEL: test2_x86_avx2_pblendd_256
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <32 x i8> @test3_x86_avx2_pblendvb(<32 x i8> %a0, <32 x i8> %a1) {
+ %1 = bitcast <4 x i64> <i64 -1, i64 -1, i64 -1, i64 -1> to <32 x i8>
+ %res = call <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8> %a0, <32 x i8> %a1, <32 x i8> %1)
+ ret <32 x i8> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendvb
+; CHECK-NOT: vpblendvb
+; CHECK: ret
+
+
+define <16 x i16> @test3_x86_avx2_pblendw(<16 x i16> %a0, <16 x i16> %a1) {
+ %res = call <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16> %a0, <16 x i16> %a1, i32 -1)
+ ret <16 x i16> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendw
+; CHECK-NOT: vpblendw
+; CHECK: ret
+
+
+define <4 x i32> @test3_x86_avx2_pblendd_128(<4 x i32> %a0, <4 x i32> %a1) {
+ %res = call <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32> %a0, <4 x i32> %a1, i32 -1)
+ ret <4 x i32> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendd_128
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+define <8 x i32> @test3_x86_avx2_pblendd_256(<8 x i32> %a0, <8 x i32> %a1) {
+ %res = call <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32> %a0, <8 x i32> %a1, i32 -1)
+ ret <8 x i32> %res
+}
+; CHECK-LABEL: test3_x86_avx2_pblendd_256
+; CHECK-NOT: vpblendd
+; CHECK: ret
+
+
+declare <32 x i8> @llvm.x86.avx2.pblendvb(<32 x i8>, <32 x i8>, <32 x i8>)
+declare <16 x i16> @llvm.x86.avx2.pblendw(<16 x i16>, <16 x i16>, i32)
+declare <4 x i32> @llvm.x86.avx2.pblendd.128(<4 x i32>, <4 x i32>, i32)
+declare <8 x i32> @llvm.x86.avx2.pblendd.256(<8 x i32>, <8 x i32>, i32)
+declare <16 x i16> @llvm.x86.avx2.psra.w(<16 x i16>, <8 x i16>)
+declare <16 x i16> @llvm.x86.avx2.psrai.w(<16 x i16>, i32)
+declare <8 x i32> @llvm.x86.avx2.psra.d(<8 x i32>, <4 x i32>)
+declare <8 x i32> @llvm.x86.avx2.psrai.d(<8 x i32>, i32)
+
diff --git a/test/CodeGen/X86/combine-sse2-intrinsics.ll b/test/CodeGen/X86/combine-sse2-intrinsics.ll
new file mode 100644
index 0000000..fa500e5
--- /dev/null
+++ b/test/CodeGen/X86/combine-sse2-intrinsics.ll
@@ -0,0 +1,53 @@
+; RUN: llc < %s -march=x86 -mcpu=core2 | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
+
+; Verify that the backend correctly combines SSE2 builtin intrinsics.
+
+
+define <4 x i32> @test_psra_1(<4 x i32> %A) {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 3)
+ %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 3, i32 0, i32 7, i32 0>)
+ %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 2)
+ ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_1
+; CHECK: psrad $8, %xmm0
+; CHECK-NEXT: ret
+
+define <8 x i16> @test_psra_2(<8 x i16> %A) {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 3)
+ %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 3, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 2)
+ ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_2
+; CHECK: psraw $8, %xmm0
+; CHECK-NEXT: ret
+
+define <4 x i32> @test_psra_3(<4 x i32> %A) {
+ %1 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %A, i32 0)
+ %2 = tail call <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32> %1, <4 x i32> <i32 0, i32 0, i32 7, i32 0>)
+ %3 = tail call <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32> %2, i32 0)
+ ret <4 x i32> %3
+}
+; CHECK-LABEL: test_psra_3
+; CHECK-NOT: psrad
+; CHECK: ret
+
+
+define <8 x i16> @test_psra_4(<8 x i16> %A) {
+ %1 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %A, i32 0)
+ %2 = tail call <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16> %1, <8 x i16> <i16 0, i16 0, i16 0, i16 0, i16 7, i16 0, i16 0, i16 0>)
+ %3 = tail call <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16> %2, i32 0)
+ ret <8 x i16> %3
+}
+; CHECK-LABEL: test_psra_4
+; CHECK-NOT: psraw
+; CHECK: ret
+
+
+declare <8 x i16> @llvm.x86.sse2.psra.w(<8 x i16>, <8 x i16>)
+declare <8 x i16> @llvm.x86.sse2.psrai.w(<8 x i16>, i32)
+declare <4 x i32> @llvm.x86.sse2.psra.d(<4 x i32>, <4 x i32>)
+declare <4 x i32> @llvm.x86.sse2.psrai.d(<4 x i32>, i32)
+
diff --git a/test/CodeGen/X86/combine-sse41-intrinsics.ll b/test/CodeGen/X86/combine-sse41-intrinsics.ll
new file mode 100644
index 0000000..254991a
--- /dev/null
+++ b/test/CodeGen/X86/combine-sse41-intrinsics.ll
@@ -0,0 +1,182 @@
+; RUN: llc < %s -mtriple=x86_64-unknown -mcpu=corei7 | FileCheck %s
+
+
+define <2 x double> @test_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 0)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test_x86_sse41_blend_pd
+; CHECK-NOT: blendpd
+; CHECK: ret
+
+
+define <4 x float> @test_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 0)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test_x86_sse41_blend_ps
+; CHECK-NOT: blendps
+; CHECK: ret
+
+
+define <2 x double> @test_x86_sse41_blendv_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> zeroinitializer)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test_x86_sse41_blendv_pd
+; CHECK-NOT: blendvpd
+; CHECK: ret
+
+
+define <4 x float> @test_x86_sse41_blendv_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> zeroinitializer)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test_x86_sse41_blendv_ps
+; CHECK-NOT: blendvps
+; CHECK: ret
+
+
+define <16 x i8> @test_x86_sse41_pblendv_b(<16 x i8> %a0, <16 x i8> %a1) {
+ %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> zeroinitializer)
+ ret <16 x i8> %1
+}
+; CHECK-LABEL: test_x86_sse41_pblendv_b
+; CHECK-NOT: pblendvb
+; CHECK: ret
+
+
+define <8 x i16> @test_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+ %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 0)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test_x86_sse41_pblend_w
+; CHECK-NOT: pblendw
+; CHECK: ret
+
+
+define <2 x double> @test2_x86_sse41_blend_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a1, i32 -1)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blend_pd
+; CHECK-NOT: blendpd
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2_x86_sse41_blend_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a1, i32 -1)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blend_ps
+; CHECK-NOT: blendps
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <2 x double> @test2_x86_sse41_blendv_pd(<2 x double> %a0, <2 x double> %a1) {
+ %Mask = bitcast <2 x i64> <i64 -1, i64 -1> to <2 x double>
+ %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a1, <2 x double> %Mask )
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blendv_pd
+; CHECK-NOT: blendvpd
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <4 x float> @test2_x86_sse41_blendv_ps(<4 x float> %a0, <4 x float> %a1) {
+ %Mask = bitcast <2 x i64> <i64 -1, i64 -1> to <4 x float>
+ %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a1, <4 x float> %Mask)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test2_x86_sse41_blendv_ps
+; CHECK-NOT: blendvps
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <16 x i8> @test2_x86_sse41_pblendv_b(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %a2) {
+ %Mask = bitcast <2 x i64> <i64 -1, i64 -1> to <16 x i8>
+ %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a1, <16 x i8> %Mask)
+ ret <16 x i8> %1
+}
+; CHECK-LABEL: test2_x86_sse41_pblendv_b
+; CHECK-NOT: pblendvb
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <8 x i16> @test2_x86_sse41_pblend_w(<8 x i16> %a0, <8 x i16> %a1) {
+ %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a1, i32 -1)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test2_x86_sse41_pblend_w
+; CHECK-NOT: pblendw
+; CHECK: movaps %xmm1, %xmm0
+; CHECK-NEXT: ret
+
+
+define <2 x double> @test3_x86_sse41_blend_pd(<2 x double> %a0) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendpd(<2 x double> %a0, <2 x double> %a0, i32 7)
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blend_pd
+; CHECK-NOT: blendpd
+; CHECK: ret
+
+
+define <4 x float> @test3_x86_sse41_blend_ps(<4 x float> %a0) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendps(<4 x float> %a0, <4 x float> %a0, i32 7)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blend_ps
+; CHECK-NOT: blendps
+; CHECK: ret
+
+
+define <2 x double> @test3_x86_sse41_blendv_pd(<2 x double> %a0, <2 x double> %a1) {
+ %1 = call <2 x double> @llvm.x86.sse41.blendvpd(<2 x double> %a0, <2 x double> %a0, <2 x double> %a1 )
+ ret <2 x double> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blendv_pd
+; CHECK-NOT: blendvpd
+; CHECK: ret
+
+
+define <4 x float> @test3_x86_sse41_blendv_ps(<4 x float> %a0, <4 x float> %a1) {
+ %1 = call <4 x float> @llvm.x86.sse41.blendvps(<4 x float> %a0, <4 x float> %a0, <4 x float> %a1)
+ ret <4 x float> %1
+}
+; CHECK-LABEL: test3_x86_sse41_blendv_ps
+; CHECK-NOT: blendvps
+; CHECK: ret
+
+
+define <16 x i8> @test3_x86_sse41_pblendv_b(<16 x i8> %a0, <16 x i8> %a1) {
+ %1 = call <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8> %a0, <16 x i8> %a0, <16 x i8> %a1)
+ ret <16 x i8> %1
+}
+; CHECK-LABEL: test3_x86_sse41_pblendv_b
+; CHECK-NOT: pblendvb
+; CHECK: ret
+
+
+define <8 x i16> @test3_x86_sse41_pblend_w(<8 x i16> %a0) {
+ %1 = call <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16> %a0, <8 x i16> %a0, i32 7)
+ ret <8 x i16> %1
+}
+; CHECK-LABEL: test3_x86_sse41_pblend_w
+; CHECK-NOT: pblendw
+; CHECK: ret
+
+
+declare <2 x double> @llvm.x86.sse41.blendpd(<2 x double>, <2 x double>, i32)
+declare <4 x float> @llvm.x86.sse41.blendps(<4 x float>, <4 x float>, i32)
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <8 x i16> @llvm.x86.sse41.pblendw(<8 x i16>, <8 x i16>, i32)
+declare <8 x i16> @llvm.x86.sse41.phminposuw(<8 x i16>)
+
diff --git a/test/CodeGen/X86/constant-hoisting-shift-immediate.ll b/test/CodeGen/X86/constant-hoisting-shift-immediate.ll
new file mode 100644
index 0000000..883be35
--- /dev/null
+++ b/test/CodeGen/X86/constant-hoisting-shift-immediate.ll
@@ -0,0 +1,25 @@
+; RUN: llc < %s -O3 -march=x86-64 |FileCheck %s
+define i64 @foo(i1 %z, i192* %p, i192* %q)
+{
+; If const 128 is hoisted to a variable, then in basic block L_val2 we would
+; have %lshr2 = lshr i192 %data2, %const, and the definition of %const would
+; be in another basic block. As a result, a very inefficient code might be
+; produced. Here we check that this doesn't occur.
+entry:
+ %data1 = load i192* %p, align 8
+ %lshr1 = lshr i192 %data1, 128
+ %val1 = trunc i192 %lshr1 to i64
+ br i1 %z, label %End, label %L_val2
+
+; CHECK: movq 16(%rdx), %rax
+; CHECK-NEXT: retq
+L_val2:
+ %data2 = load i192* %q, align 8
+ %lshr2 = lshr i192 %data2, 128
+ %val2 = trunc i192 %lshr2 to i64
+ br label %End
+
+End:
+ %p1 = phi i64 [%val1,%entry], [%val2,%L_val2]
+ ret i64 %p1
+}
diff --git a/test/CodeGen/X86/divide-by-constant.ll b/test/CodeGen/X86/divide-by-constant.ll
index 98ae1d5..21225e3 100644
--- a/test/CodeGen/X86/divide-by-constant.ll
+++ b/test/CodeGen/X86/divide-by-constant.ll
@@ -7,7 +7,7 @@ entry:
%div = udiv i16 %x, 33
ret i16 %div
; CHECK-LABEL: test1:
-; CHECK: imull $63551, %eax, %eax
+; CHECK: imull $63551, %eax
; CHECK-NEXT: shrl $21, %eax
; CHECK-NEXT: ret
}
@@ -18,7 +18,7 @@ entry:
ret i16 %div
; CHECK-LABEL: test2:
-; CHECK: imull $43691, %eax, %eax
+; CHECK: imull $43691, %eax
; CHECK-NEXT: shrl $17, %eax
; CHECK-NEXT: ret
}
@@ -30,7 +30,7 @@ entry:
; CHECK-LABEL: test3:
; CHECK: movzbl 8(%esp), %eax
-; CHECK-NEXT: imull $171, %eax, %eax
+; CHECK-NEXT: imull $171, %eax
; CHECK-NEXT: shrl $9, %eax
; CHECK-NEXT: ret
}
@@ -40,7 +40,7 @@ entry:
%div = sdiv i16 %x, 33 ; <i32> [#uses=1]
ret i16 %div
; CHECK-LABEL: test4:
-; CHECK: imull $1986, %eax, %
+; CHECK: imull $1986, %eax
}
define i32 @test5(i32 %A) nounwind {
diff --git a/test/CodeGen/X86/dllexport-x86_64.ll b/test/CodeGen/X86/dllexport-x86_64.ll
index a38c2d8..f4dec4f 100644
--- a/test/CodeGen/X86/dllexport-x86_64.ll
+++ b/test/CodeGen/X86/dllexport-x86_64.ll
@@ -40,18 +40,18 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl Var1
@Var1 = dllexport global i32 1, align 4
-; CHECK: .rdata,"r"
+; CHECK: .rdata,"rd"
; CHECK: .globl Var2
@Var2 = dllexport unnamed_addr constant i32 1
; CHECK: .comm Var3
@Var3 = common dllexport global i32 0, align 4
-; CHECK: .section .data,"w",discard,WeakVar1
+; CHECK: .section .data,"wd",discard,WeakVar1
; CHECK: .globl WeakVar1
@WeakVar1 = weak_odr dllexport global i32 1, align 4
-; CHECK: .section .rdata,"r",discard,WeakVar2
+; CHECK: .section .rdata,"rd",discard,WeakVar2
; CHECK: .globl WeakVar2
@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
@@ -66,39 +66,43 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl alias3
; CHECK: alias3 = notExported
-@alias3 = dllexport alias void()* @alias
+@alias3 = dllexport alias void()* @notExported
; CHECK: .weak weak_alias
; CHECK: weak_alias = f1
@weak_alias = dllexport alias weak_odr void()* @f1
+@blob = global [6 x i8] c"\B8*\00\00\00\C3", section ".text", align 16
+@blob_alias = dllexport alias i32 (), [6 x i8]* @blob
; CHECK: .section .drectve
-; WIN32: /EXPORT:Var1,DATA
-; WIN32: /EXPORT:Var2,DATA
-; WIN32: /EXPORT:Var3,DATA
-; WIN32: /EXPORT:WeakVar1,DATA
-; WIN32: /EXPORT:WeakVar2,DATA
-; WIN32: /EXPORT:f1
-; WIN32: /EXPORT:f2
-; WIN32: /EXPORT:lnk1
-; WIN32: /EXPORT:lnk2
-; WIN32: /EXPORT:weak1
-; WIN32: /EXPORT:alias
-; WIN32: /EXPORT:alias2
-; WIN32: /EXPORT:alias3
-; WIN32: /EXPORT:weak_alias
-; MINGW: -export:Var1,data
-; MINGW: -export:Var2,data
-; MINGW: -export:Var3,data
-; MINGW: -export:WeakVar1,data
-; MINGW: -export:WeakVar2,data
-; MINGW: -export:f1
-; MINGW: -export:f2
-; MINGW: -export:lnk1
-; MINGW: -export:lnk2
-; MINGW: -export:weak1
-; MINGW: -export:alias
-; MINGW: -export:alias2
-; MINGW: -export:alias3
-; MINGW: -export:weak_alias
+; WIN32: " /EXPORT:Var1,DATA"
+; WIN32: " /EXPORT:Var2,DATA"
+; WIN32: " /EXPORT:Var3,DATA"
+; WIN32: " /EXPORT:WeakVar1,DATA"
+; WIN32: " /EXPORT:WeakVar2,DATA"
+; WIN32: " /EXPORT:f1"
+; WIN32: " /EXPORT:f2"
+; WIN32: " /EXPORT:lnk1"
+; WIN32: " /EXPORT:lnk2"
+; WIN32: " /EXPORT:weak1"
+; WIN32: " /EXPORT:alias"
+; WIN32: " /EXPORT:alias2"
+; WIN32: " /EXPORT:alias3"
+; WIN32: " /EXPORT:weak_alias"
+; WIN32: " /EXPORT:blob_alias"
+; MINGW: " -export:Var1,data"
+; MINGW: " -export:Var2,data"
+; MINGW: " -export:Var3,data"
+; MINGW: " -export:WeakVar1,data"
+; MINGW: " -export:WeakVar2,data"
+; MINGW: " -export:f1"
+; MINGW: " -export:f2"
+; MINGW: " -export:lnk1"
+; MINGW: " -export:lnk2"
+; MINGW: " -export:weak1"
+; MINGW: " -export:alias"
+; MINGW: " -export:alias2"
+; MINGW: " -export:alias3"
+; MINGW: " -export:weak_alias"
+; MINGW: " -export:blob_alias"
diff --git a/test/CodeGen/X86/dllexport.ll b/test/CodeGen/X86/dllexport.ll
index 1b34d23..e2c3f13 100644
--- a/test/CodeGen/X86/dllexport.ll
+++ b/test/CodeGen/X86/dllexport.ll
@@ -1,5 +1,9 @@
-; RUN: llc -mtriple i386-pc-win32 < %s | FileCheck -check-prefix=CHECK -check-prefix=WIN32 %s
-; RUN: llc -mtriple i386-pc-mingw32 < %s | FileCheck -check-prefix=CHECK -check-prefix=MINGW %s
+; RUN: llc -mtriple i386-pc-win32 < %s \
+; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-CL %s
+; RUN: llc -mtriple i386-pc-mingw32 < %s \
+; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
+; RUN: llc -mtriple i686-pc-cygwin %s -o - \
+; RUN: | FileCheck -check-prefix CHECK -check-prefix CHECK-GCC %s
; CHECK: .text
@@ -55,18 +59,18 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl _Var1
@Var1 = dllexport global i32 1, align 4
-; CHECK: .rdata,"r"
+; CHECK: .rdata,"rd"
; CHECK: .globl _Var2
@Var2 = dllexport unnamed_addr constant i32 1
; CHECK: .comm _Var3
@Var3 = common dllexport global i32 0, align 4
-; CHECK: .section .data,"w",discard,_WeakVar1
+; CHECK: .section .data,"wd",discard,_WeakVar1
; CHECK: .globl _WeakVar1
@WeakVar1 = weak_odr dllexport global i32 1, align 4
-; CHECK: .section .rdata,"r",discard,_WeakVar2
+; CHECK: .section .rdata,"rd",discard,_WeakVar2
; CHECK: .globl _WeakVar2
@WeakVar2 = weak_odr dllexport unnamed_addr constant i32 1
@@ -81,7 +85,7 @@ define weak_odr dllexport void @weak1() {
; CHECK: .globl _alias3
; CHECK: _alias3 = _notExported
-@alias3 = dllexport alias void()* @alias
+@alias3 = dllexport alias void()* @notExported
; CHECK: .weak _weak_alias
; CHECK: _weak_alias = _f1
@@ -89,37 +93,38 @@ define weak_odr dllexport void @weak1() {
; CHECK: .section .drectve
-; WIN32: /EXPORT:_Var1,DATA
-; WIN32: /EXPORT:_Var2,DATA
-; WIN32: /EXPORT:_Var3,DATA
-; WIN32: /EXPORT:_WeakVar1,DATA
-; WIN32: /EXPORT:_WeakVar2,DATA
-; WIN32: /EXPORT:_f1
-; WIN32: /EXPORT:_f2
-; WIN32: /EXPORT:_stdfun@0
-; WIN32: /EXPORT:@fastfun@0
-; WIN32: /EXPORT:_thisfun
-; WIN32: /EXPORT:_lnk1
-; WIN32: /EXPORT:_lnk2
-; WIN32: /EXPORT:_weak1
-; WIN32: /EXPORT:_alias
-; WIN32: /EXPORT:_alias2
-; WIN32: /EXPORT:_alias3
-; WIN32: /EXPORT:_weak_alias
-; MINGW: -export:_Var1,data
-; MINGW: -export:_Var2,data
-; MINGW: -export:_Var3,data
-; MINGW: -export:_WeakVar1,data
-; MINGW: -export:_WeakVar2,data
-; MINGW: -export:_f1
-; MINGW: -export:_f2
-; MINGW: -export:_stdfun@0
-; MINGW: -export:@fastfun@0
-; MINGW: -export:_thisfun
-; MINGW: -export:_lnk1
-; MINGW: -export:_lnk2
-; MINGW: -export:_weak1
-; MINGW: -export:_alias
-; MINGW: -export:_alias2
-; MINGW: -export:_alias3
-; MINGW: -export:_weak_alias
+; CHECK-CL: " /EXPORT:_Var1,DATA"
+; CHECK-CL: " /EXPORT:_Var2,DATA"
+; CHECK-CL: " /EXPORT:_Var3,DATA"
+; CHECK-CL: " /EXPORT:_WeakVar1,DATA"
+; CHECK-CL: " /EXPORT:_WeakVar2,DATA"
+; CHECK-CL: " /EXPORT:_f1"
+; CHECK-CL: " /EXPORT:_f2"
+; CHECK-CL: " /EXPORT:_stdfun@0"
+; CHECK-CL: " /EXPORT:@fastfun@0"
+; CHECK-CL: " /EXPORT:_thisfun"
+; CHECK-CL: " /EXPORT:_lnk1"
+; CHECK-CL: " /EXPORT:_lnk2"
+; CHECK-CL: " /EXPORT:_weak1"
+; CHECK-CL: " /EXPORT:_alias"
+; CHECK-CL: " /EXPORT:_alias2"
+; CHECK-CL: " /EXPORT:_alias3"
+; CHECK-CL: " /EXPORT:_weak_alias"
+; CHECK-GCC: " -export:Var1,data"
+; CHECK-GCC: " -export:Var2,data"
+; CHECK-GCC: " -export:Var3,data"
+; CHECK-GCC: " -export:WeakVar1,data"
+; CHECK-GCC: " -export:WeakVar2,data"
+; CHECK-GCC: " -export:f1"
+; CHECK-GCC: " -export:f2"
+; CHECK-GCC: " -export:stdfun@0"
+; CHECK-GCC: " -export:@fastfun@0"
+; CHECK-GCC: " -export:thisfun"
+; CHECK-GCC: " -export:lnk1"
+; CHECK-GCC: " -export:lnk2"
+; CHECK-GCC: " -export:weak1"
+; CHECK-GCC: " -export:alias"
+; CHECK-GCC: " -export:alias2"
+; CHECK-GCC: " -export:alias3"
+; CHECK-GCC: " -export:weak_alias"
+
diff --git a/test/CodeGen/X86/expand-opaque-const.ll b/test/CodeGen/X86/expand-opaque-const.ll
new file mode 100644
index 0000000..6e461cf
--- /dev/null
+++ b/test/CodeGen/X86/expand-opaque-const.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mcpu=generic -O1 -relocation-model=pic < %s | FileCheck %s
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i686-apple-darwin"
+
+define i64 @test_lshr() {
+entry:
+; CHECK-NOT: movl $-1, 16(%esp)
+; CHECK-NOT: movl $-1, %eax
+ %retval = alloca i64
+ %op1 = alloca i64
+ %op2 = alloca i64
+ store i64 -6687208052682386272, i64* %op1
+ store i64 7106745059734980448, i64* %op2
+ %tmp1 = load i64* %op1
+ %tmp2 = load i64* %op2
+ %tmp = xor i64 %tmp2, 7106745059734980448
+ %tmp3 = lshr i64 %tmp1, %tmp
+ store i64 %tmp3, i64* %retval
+ %tmp4 = load i64* %retval
+ ret i64 %tmp4
+}
diff --git a/test/CodeGen/X86/f16c-intrinsics.ll b/test/CodeGen/X86/f16c-intrinsics.ll
index 2135f94..514d929 100644
--- a/test/CodeGen/X86/f16c-intrinsics.ll
+++ b/test/CodeGen/X86/f16c-intrinsics.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86 -mattr=+avx,+f16c | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+avx,+f16c | FileCheck %s
define <4 x float> @test_x86_vcvtph2ps_128(<8 x i16> %a0) {
; CHECK: vcvtph2ps
@@ -30,3 +31,16 @@ define <8 x i16> @test_x86_vcvtps2ph_256(<8 x float> %a0) {
ret <8 x i16> %res
}
declare <8 x i16> @llvm.x86.vcvtps2ph.256(<8 x float>, i32) nounwind readonly
+
+define <4 x float> @test_x86_vcvtps2ph_128_scalar(i64* %ptr) {
+; CHECK-LABEL: test_x86_vcvtps2ph_128_scalar
+; CHECK-NOT: vmov
+; CHECK: vcvtph2ps (%
+
+ %load = load i64* %ptr
+ %ins1 = insertelement <2 x i64> undef, i64 %load, i32 0
+ %ins2 = insertelement <2 x i64> %ins1, i64 0, i32 1
+ %bc = bitcast <2 x i64> %ins2 to <8 x i16>
+ %res = tail call <4 x float> @llvm.x86.vcvtph2ps.128(<8 x i16> %bc) #2
+ ret <4 x float> %res
+}
diff --git a/test/CodeGen/X86/fma-do-not-commute.ll b/test/CodeGen/X86/fma-do-not-commute.ll
new file mode 100644
index 0000000..4e21172
--- /dev/null
+++ b/test/CodeGen/X86/fma-do-not-commute.ll
@@ -0,0 +1,30 @@
+; RUN: llc -fp-contract=fast -mattr=+fma -disable-cgp < %s -o - | FileCheck %s
+; Check that the 2nd and 3rd arguments of fmaXXX231 reg1, reg2, mem3 are not commuted.
+; <rdar://problem/16800495>
+target datalayout = "e-m:o-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx"
+
+; CHECK-LABEL: test1:
+; %arg lives in xmm0 and it shouldn't be redefined until it is used in the FMA.
+; CHECK-NOT {{.*}}, %xmm0
+; %addr lives in rdi.
+; %addr2 lives in rsi.
+; CHECK: vmovss (%rsi), [[ADDR2:%xmm[0-9]+]]
+; The assembly syntax is in the reverse order.
+; CHECK: vfmadd231ss (%rdi), [[ADDR2]], %xmm0
+define void @test1(float* %addr, float* %addr2, float %arg) {
+entry:
+ br label %loop
+
+loop:
+ %sum0 = phi float [ %fma, %loop ], [ %arg, %entry ]
+ %addrVal = load float* %addr, align 4
+ %addr2Val = load float* %addr2, align 4
+ %fmul = fmul float %addrVal, %addr2Val
+ %fma = fadd float %sum0, %fmul
+ br i1 true, label %exit, label %loop
+
+exit:
+ store float %fma, float* %addr, align 4
+ ret void
+}
diff --git a/test/CodeGen/X86/fold-load-vec.ll b/test/CodeGen/X86/fold-load-vec.ll
index e85d8f7..96c5be4 100644
--- a/test/CodeGen/X86/fold-load-vec.ll
+++ b/test/CodeGen/X86/fold-load-vec.ll
@@ -5,7 +5,7 @@
; loads from m32.
define void @sample_test(<4 x float>* %source, <2 x float>* %dest) nounwind {
; CHECK: sample_test
-; CHECK: movaps
+; CHECK-NOT: movaps
; CHECK: insertps
entry:
%source.addr = alloca <4 x float>*, align 8
diff --git a/test/CodeGen/X86/gcc_except_table.ll b/test/CodeGen/X86/gcc_except_table.ll
index 7a29b07..8c328ec 100644
--- a/test/CodeGen/X86/gcc_except_table.ll
+++ b/test/CodeGen/X86/gcc_except_table.ll
@@ -50,7 +50,3 @@ eh.resume:
declare void @_Z1fv() optsize
declare i32 @__gxx_personality_v0(...)
-
-; CHECK: Leh_func_end0:
-; CHECK: GCC_except_table0
-; CHECK: = Leh_func_end0-
diff --git a/test/CodeGen/X86/global-sections.ll b/test/CodeGen/X86/global-sections.ll
index 5ad5047..c763f39 100644
--- a/test/CodeGen/X86/global-sections.ll
+++ b/test/CodeGen/X86/global-sections.ll
@@ -2,8 +2,8 @@
; RUN: llc < %s -mtriple=i386-apple-darwin9.7 | FileCheck %s -check-prefix=DARWIN
; RUN: llc < %s -mtriple=i386-apple-darwin10 -relocation-model=static | FileCheck %s -check-prefix=DARWIN-STATIC
; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s -check-prefix=DARWIN64
-; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -fdata-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
-; RUN: llc < %s -mtriple=i686-pc-win32 -fdata-sections -ffunction-sections | FileCheck %s -check-prefix=WIN32-SECTIONS
+; RUN: llc < %s -mtriple=i386-unknown-linux-gnu -data-sections | FileCheck %s -check-prefix=LINUX-SECTIONS
+; RUN: llc < %s -mtriple=i686-pc-win32 -data-sections -function-sections | FileCheck %s -check-prefix=WIN32-SECTIONS
define void @F1() {
ret void
@@ -18,13 +18,13 @@ define void @F1() {
; LINUX: .type G1,@object
; LINUX: .comm G1,4,4
-; DARWIN: .comm _G1,4,2
+; DARWIN: .comm _G1,4,2
; const int G2 __attribute__((weak)) = 42;
-@G2 = weak_odr unnamed_addr constant i32 42
+@G2 = weak_odr unnamed_addr constant i32 42
; TODO: linux drops this into .rodata, we drop it into ".gnu.linkonce.r.G2"
@@ -48,7 +48,7 @@ define void @F1() {
; LINUX-SECTIONS: .section .rodata.G3,"a",@progbits
; LINUX-SECTIONS: .globl G3
-; WIN32-SECTIONS: .section .rdata,"r",one_only,_G3
+; WIN32-SECTIONS: .section .rdata,"rd",one_only,_G3
; WIN32-SECTIONS: .globl _G3
@@ -85,25 +85,25 @@ define void @F1() {
; PR4584
@"foo bar" = linkonce global i32 42
-; LINUX: .type "foo bar",@object
+; LINUX: .type "foo bar",@object
; LINUX: .section ".data.foo bar","aGw",@progbits,"foo bar",comdat
-; LINUX: .weak "foo bar"
+; LINUX: .weak "foo bar"
; LINUX: "foo bar":
-; DARWIN: .section __DATA,__datacoal_nt,coalesced
-; DARWIN: .globl "_foo bar"
-; DARWIN: .weak_definition "_foo bar"
+; DARWIN: .section __DATA,__datacoal_nt,coalesced
+; DARWIN: .globl "_foo bar"
+; DARWIN: .weak_definition "_foo bar"
; DARWIN: "_foo bar":
; PR4650
@G6 = weak_odr unnamed_addr constant [1 x i8] c"\01"
-; LINUX: .type G6,@object
-; LINUX: .section .rodata.G6,"aG",@progbits,G6,comdat
-; LINUX: .weak G6
+; LINUX: .type G6,@object
+; LINUX: .section .rodata.G6,"aG",@progbits,G6,comdat
+; LINUX: .weak G6
; LINUX: G6:
-; LINUX: .byte 1
-; LINUX: .size G6, 1
+; LINUX: .byte 1
+; LINUX: .size G6, 1
; DARWIN: .section __TEXT,__const_coal,coalesced
; DARWIN: .globl _G6
@@ -114,58 +114,58 @@ define void @F1() {
@G7 = unnamed_addr constant [10 x i8] c"abcdefghi\00"
-; DARWIN: __TEXT,__cstring,cstring_literals
-; DARWIN: .globl _G7
+; DARWIN: __TEXT,__cstring,cstring_literals
+; DARWIN: .globl _G7
; DARWIN: _G7:
-; DARWIN: .asciz "abcdefghi"
+; DARWIN: .asciz "abcdefghi"
-; LINUX: .section .rodata.str1.1,"aMS",@progbits,1
-; LINUX: .globl G7
+; LINUX: .section .rodata.str1.1,"aMS",@progbits,1
+; LINUX: .globl G7
; LINUX: G7:
-; LINUX: .asciz "abcdefghi"
+; LINUX: .asciz "abcdefghi"
; LINUX-SECTIONS: .section .rodata.G7,"aMS",@progbits,1
-; LINUX-SECTIONS: .globl G7
+; LINUX-SECTIONS: .globl G7
-; WIN32-SECTIONS: .section .rdata,"r",one_only,_G7
-; WIN32-SECTIONS: .globl _G7
+; WIN32-SECTIONS: .section .rdata,"rd",one_only,_G7
+; WIN32-SECTIONS: .globl _G7
@G8 = unnamed_addr constant [4 x i16] [ i16 1, i16 2, i16 3, i16 0 ]
-; DARWIN: .section __TEXT,__const
-; DARWIN: .globl _G8
+; DARWIN: .section __TEXT,__const
+; DARWIN: .globl _G8
; DARWIN: _G8:
-; LINUX: .section .rodata.str2.2,"aMS",@progbits,2
-; LINUX: .globl G8
+; LINUX: .section .rodata.str2.2,"aMS",@progbits,2
+; LINUX: .globl G8
; LINUX:G8:
@G9 = unnamed_addr constant [4 x i32] [ i32 1, i32 2, i32 3, i32 0 ]
-; DARWIN: .globl _G9
+; DARWIN: .globl _G9
; DARWIN: _G9:
-; LINUX: .section .rodata.str4.4,"aMS",@progbits,4
-; LINUX: .globl G9
+; LINUX: .section .rodata.str4.4,"aMS",@progbits,4
+; LINUX: .globl G9
; LINUX:G9
@G10 = weak global [100 x i32] zeroinitializer, align 32 ; <[100 x i32]*> [#uses=0]
-; DARWIN: .section __DATA,__datacoal_nt,coalesced
+; DARWIN: .section __DATA,__datacoal_nt,coalesced
; DARWIN: .globl _G10
-; DARWIN: .weak_definition _G10
-; DARWIN: .align 5
+; DARWIN: .weak_definition _G10
+; DARWIN: .align 5
; DARWIN: _G10:
-; DARWIN: .space 400
+; DARWIN: .space 400
-; LINUX: .bss
-; LINUX: .weak G10
-; LINUX: .align 32
+; LINUX: .bss
+; LINUX: .weak G10
+; LINUX: .align 32
; LINUX: G10:
-; LINUX: .zero 400
+; LINUX: .zero 400
@@ -190,7 +190,7 @@ define void @F1() {
; LINUX-SECTIONS: .asciz "foo"
; LINUX-SECTIONS: .size .LG14, 4
-; WIN32-SECTIONS: .section .rdata,"r"
+; WIN32-SECTIONS: .section .rdata,"rd"
; WIN32-SECTIONS: L_G14:
; WIN32-SECTIONS: .asciz "foo"
diff --git a/test/CodeGen/X86/indirect-hidden.ll b/test/CodeGen/X86/indirect-hidden.ll
new file mode 100644
index 0000000..309375d
--- /dev/null
+++ b/test/CodeGen/X86/indirect-hidden.ll
@@ -0,0 +1,43 @@
+; RUN: llc -mtriple=i686-apple-macosx -o - %s | FileCheck %s
+
+; x86 doesn't normally use indirect symbols, particularly hidden ones, but it
+; can be tricked into it for exception-handling typeids.
+
+@hidden_typeid = external hidden constant i8*
+@normal_typeid = external constant i8*
+
+declare void @throws()
+
+define void @get_indirect_hidden() {
+ invoke void @throws() to label %end unwind label %lpad
+lpad:
+ %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @hidden_typeid to i8*)
+ br label %end
+
+end:
+ ret void
+}
+
+define void @get_indirect() {
+ invoke void @throws() to label %end unwind label %lpad
+lpad:
+ %tmp = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*)
+ catch i8* bitcast (i8** @normal_typeid to i8*)
+ br label %end
+
+end:
+ ret void
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+; CHECK: .section __IMPORT,__pointers,non_lazy_symbol_pointers
+
+; CHECK-NOT: __DATA,__data
+; CHECK: .indirect_symbol _normal_typeid
+; CHECK-NEXT: .long 0
+
+; CHECK-NOT: __DATA,__data
+; CHECK: .indirect_symbol _hidden_typeid
+; CHECK-NEXT: .long 0
diff --git a/test/CodeGen/X86/isel-sink.ll b/test/CodeGen/X86/isel-sink.ll
index 458f19d..e4af9b6 100644
--- a/test/CodeGen/X86/isel-sink.ll
+++ b/test/CodeGen/X86/isel-sink.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -march=x86 | FileCheck %s
+; RUN: llc < %s -march=x86 -addr-sink-using-gep=1 | FileCheck %s
define i32 @test(i32* %X, i32 %B) {
; CHECK-LABEL: test:
diff --git a/test/CodeGen/X86/lit.local.cfg b/test/CodeGen/X86/lit.local.cfg
index 1637fa4..3d91b03 100644
--- a/test/CodeGen/X86/lit.local.cfg
+++ b/test/CodeGen/X86/lit.local.cfg
@@ -4,7 +4,7 @@
#
# It should be possible to remove this override once all the bots have cycled
# cleanly.
-config.suffixes = ['.ll', '.c', '.cpp', '.test', '.txt']
+config.suffixes = ['.ll', '.test', '.txt']
targets = set(config.root.targets_to_build.split())
if not 'X86' in targets:
diff --git a/test/CodeGen/X86/live-out-reg-info.ll b/test/CodeGen/X86/live-out-reg-info.ll
index 8cd9774..283ee3a 100644
--- a/test/CodeGen/X86/live-out-reg-info.ll
+++ b/test/CodeGen/X86/live-out-reg-info.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -march=x86-64 | grep testb
; Make sure dagcombine doesn't eliminate the comparison due
-; to an off-by-one bug with ComputeMaskedBits information.
+; to an off-by-one bug with computeKnownBits information.
declare void @qux()
diff --git a/test/CodeGen/X86/lower-bitcast.ll b/test/CodeGen/X86/lower-bitcast.ll
new file mode 100644
index 0000000..b9b29a5
--- /dev/null
+++ b/test/CodeGen/X86/lower-bitcast.ll
@@ -0,0 +1,155 @@
+; RUN: llc < %s -march=x86-64 -mcpu=core2 -mattr=+sse2 | FileCheck %s
+
+
+define double @test1(double %A) {
+ %1 = bitcast double %A to <2 x i32>
+ %add = add <2 x i32> %1, <i32 3, i32 5>
+ %2 = bitcast <2 x i32> %add to double
+ ret double %2
+}
+; FIXME: Ideally we should be able to fold the entire body of @test1 into a
+; single paddd instruction. At the moment we produce the sequence
+; pshufd+paddq+pshufd.
+
+; CHECK-LABEL: test1
+; CHECK-NOT: movsd
+; CHECK: pshufd
+; CHECK-NEXT: paddq
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define double @test2(double %A, double %B) {
+ %1 = bitcast double %A to <2 x i32>
+ %2 = bitcast double %B to <2 x i32>
+ %add = add <2 x i32> %1, %2
+ %3 = bitcast <2 x i32> %add to double
+ ret double %3
+}
+; FIXME: Ideally we should be able to fold the entire body of @test2 into a
+; single 'paddd %xmm1, %xmm0' instruction. At the moment we produce the
+; sequence pshufd+pshufd+paddq+pshufd.
+
+; CHECK-LABEL: test2
+; CHECK-NOT: movsd
+; CHECK: pshufd
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: paddq
+; CHECK-NEXT: pshufd
+; CHECK-NEXT: ret
+
+
+define i64 @test3(i64 %A) {
+ %1 = bitcast i64 %A to <2 x float>
+ %add = fadd <2 x float> %1, <float 3.0, float 5.0>
+ %2 = bitcast <2 x float> %add to i64
+ ret i64 %2
+}
+; CHECK-LABEL: test3
+; CHECK-NOT: pshufd
+; CHECK: addps
+; CHECK-NOT: pshufd
+; CHECK: ret
+
+
+define i64 @test4(i64 %A) {
+ %1 = bitcast i64 %A to <2 x i32>
+ %add = add <2 x i32> %1, <i32 3, i32 5>
+ %2 = bitcast <2 x i32> %add to i64
+ ret i64 %2
+}
+; FIXME: At the moment we still produce the sequence pshufd+paddq+pshufd.
+; Ideally, we should fold that sequence into a single paddd.
+
+; CHECK-LABEL: test4
+; CHECK: pshufd
+; CHECK-NEXT: paddq
+; CHECK-NEXT: pshufd
+; CHECK: ret
+
+
+define double @test5(double %A) {
+ %1 = bitcast double %A to <2 x float>
+ %add = fadd <2 x float> %1, <float 3.0, float 5.0>
+ %2 = bitcast <2 x float> %add to double
+ ret double %2
+}
+; CHECK-LABEL: test5
+; CHECK: addps
+; CHECK-NEXT: ret
+
+
+define double @test6(double %A) {
+ %1 = bitcast double %A to <4 x i16>
+ %add = add <4 x i16> %1, <i16 3, i16 4, i16 5, i16 6>
+ %2 = bitcast <4 x i16> %add to double
+ ret double %2
+}
+; FIXME: Ideally we should be able to fold the entire body of @test6 into a
+; single paddw instruction.
+
+; CHECK-LABEL: test6
+; CHECK-NOT: movsd
+; CHECK: punpcklwd
+; CHECK-NEXT: paddd
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test7(double %A, double %B) {
+ %1 = bitcast double %A to <4 x i16>
+ %2 = bitcast double %B to <4 x i16>
+ %add = add <4 x i16> %1, %2
+ %3 = bitcast <4 x i16> %add to double
+ ret double %3
+}
+; FIXME: Ideally we should be able to fold the entire body of @test7 into a
+; single 'paddw %xmm1, %xmm0' instruction. At the moment we produce the
+; sequence pshufd+pshufd+paddd+pshufd.
+
+; CHECK-LABEL: test7
+; CHECK-NOT: movsd
+; CHECK: punpcklwd
+; CHECK-NEXT: punpcklwd
+; CHECK-NEXT: paddd
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test8(double %A) {
+ %1 = bitcast double %A to <8 x i8>
+ %add = add <8 x i8> %1, <i8 3, i8 4, i8 5, i8 6, i8 7, i8 8, i8 9, i8 10>
+ %2 = bitcast <8 x i8> %add to double
+ ret double %2
+}
+; FIXME: Ideally we should be able to fold the entire body of @test8 into a
+; single paddb instruction. At the moment we produce the sequence
+; pshufd+paddw+pshufd.
+
+; CHECK-LABEL: test8
+; CHECK-NOT: movsd
+; CHECK: punpcklbw
+; CHECK-NEXT: paddw
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
+
+define double @test9(double %A, double %B) {
+ %1 = bitcast double %A to <8 x i8>
+ %2 = bitcast double %B to <8 x i8>
+ %add = add <8 x i8> %1, %2
+ %3 = bitcast <8 x i8> %add to double
+ ret double %3
+}
+; FIXME: Ideally we should be able to fold the entire body of @test9 into a
+; single 'paddb %xmm1, %xmm0' instruction. At the moment we produce the
+; sequence pshufd+pshufd+paddw+pshufd.
+
+; CHECK-LABEL: test9
+; CHECK-NOT: movsd
+; CHECK: punpcklbw
+; CHECK-NEXT: punpcklbw
+; CHECK-NEXT: paddw
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+
diff --git a/test/CodeGen/X86/lower-vec-shift.ll b/test/CodeGen/X86/lower-vec-shift.ll
new file mode 100644
index 0000000..c28f82a
--- /dev/null
+++ b/test/CodeGen/X86/lower-vec-shift.ll
@@ -0,0 +1,125 @@
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=corei7-avx | FileCheck %s --check-prefix=CHECK --check-prefix=AVX
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core2 | FileCheck %s --check-prefix=CHECK --check-prefix=SSE
+; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -mcpu=core-avx2 | FileCheck %s --check-prefix=CHECK --check-prefix=AVX2
+
+
+; Verify that the following shifts are lowered into a sequence of two shifts plus
+; a blend. On pre-avx2 targets, instead of scalarizing logical and arithmetic
+; packed shift right by a constant build_vector the backend should always try to
+; emit a simpler sequence of two shifts + blend when possible.
+
+define <8 x i16> @test1(<8 x i16> %a) {
+ %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+; CHECK-LABEL: test1
+; SSE: psrlw
+; SSE-NEXT: psrlw
+; SSE-NEXT: movss
+; AVX: vpsrlw
+; AVX-NEXT: vpsrlw
+; AVX-NEXT: vmovss
+; AVX2: vpsrlw
+; AVX2-NEXT: vpsrlw
+; AVX2-NEXT: vmovss
+; CHECK: ret
+
+
+define <8 x i16> @test2(<8 x i16> %a) {
+ %lshr = lshr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+; CHECK-LABEL: test2
+; SSE: psrlw
+; SSE-NEXT: psrlw
+; SSE-NEXT: movsd
+; AVX: vpsrlw
+; AVX-NEXT: vpsrlw
+; AVX-NEXT: vmovsd
+; AVX2: vpsrlw
+; AVX2-NEXT: vpsrlw
+; AVX2-NEXT: vmovsd
+; CHECK: ret
+
+
+define <4 x i32> @test3(<4 x i32> %a) {
+ %lshr = lshr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test3
+; SSE: psrld
+; SSE-NEXT: psrld
+; SSE-NEXT: movss
+; AVX: vpsrld
+; AVX-NEXT: vpsrld
+; AVX-NEXT: vmovss
+; AVX2: vpsrlvd
+; CHECK: ret
+
+
+define <4 x i32> @test4(<4 x i32> %a) {
+ %lshr = lshr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test4
+; SSE: psrld
+; SSE-NEXT: psrld
+; SSE-NEXT: movsd
+; AVX: vpsrld
+; AVX-NEXT: vpsrld
+; AVX-NEXT: vmovsd
+; AVX2: vpsrlvd
+; CHECK: ret
+
+
+define <8 x i16> @test5(<8 x i16> %a) {
+ %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 2, i16 2, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+
+define <8 x i16> @test6(<8 x i16> %a) {
+ %lshr = ashr <8 x i16> %a, <i16 3, i16 3, i16 3, i16 3, i16 2, i16 2, i16 2, i16 2>
+ ret <8 x i16> %lshr
+}
+; CHECK-LABEL: test6
+; SSE: psraw
+; SSE-NEXT: psraw
+; SSE-NEXT: movsd
+; AVX: vpsraw
+; AVX-NEXT: vpsraw
+; AVX-NEXT: vmovsd
+; AVX2: vpsraw
+; AVX2-NEXT: vpsraw
+; AVX2-NEXT: vmovsd
+; CHECK: ret
+
+
+define <4 x i32> @test7(<4 x i32> %a) {
+ %lshr = ashr <4 x i32> %a, <i32 3, i32 2, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test7
+; SSE: psrad
+; SSE-NEXT: psrad
+; SSE-NEXT: movss
+; AVX: vpsrad
+; AVX-NEXT: vpsrad
+; AVX-NEXT: vmovss
+; AVX2: vpsravd
+; CHECK: ret
+
+
+define <4 x i32> @test8(<4 x i32> %a) {
+ %lshr = ashr <4 x i32> %a, <i32 3, i32 3, i32 2, i32 2>
+ ret <4 x i32> %lshr
+}
+; CHECK-LABEL: test8
+; SSE: psrad
+; SSE-NEXT: psrad
+; SSE-NEXT: movsd
+; AVX: vpsrad
+; AVX-NEXT: vpsrad
+; AVX-NEXT: vmovsd
+; AVX2: vpsravd
+; CHECK: ret
+
diff --git a/test/CodeGen/X86/lzcnt-tzcnt.ll b/test/CodeGen/X86/lzcnt-tzcnt.ll
new file mode 100644
index 0000000..07e4b9d
--- /dev/null
+++ b/test/CodeGen/X86/lzcnt-tzcnt.ll
@@ -0,0 +1,447 @@
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+bmi,+lzcnt | FileCheck %s
+
+; LZCNT and TZCNT will always produce the operand size when the input operand
+; is zero. This test is to verify that we efficiently select LZCNT/TZCNT
+; based on the fact that the 'icmp+select' sequence is always redundant
+; in every function defined below.
+
+
+define i16 @test1_ctlz(i16 %v) {
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test1_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test2_ctlz(i32 %v) {
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test2_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test3_ctlz(i64 %v) {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test3_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test4_ctlz(i16 %v) {
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test4_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test5_ctlz(i32 %v) {
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test5_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test6_ctlz(i64 %v) {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test6_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test7_ctlz(i16 %v) {
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test7_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test8_ctlz(i32 %v) {
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test8_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test9_ctlz(i64 %v) {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test9_ctlz
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test10_ctlz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test10_ctlz
+; CHECK-NOT: movw
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test11_ctlz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test11_ctlz
+; CHECK-NOT: movd
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test12_ctlz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test12_ctlz
+; CHECK-NOT: movq
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test13_ctlz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test13_ctlz
+; CHECK-NOT: movw
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test14_ctlz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test14_ctlz
+; CHECK-NOT: movd
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test15_ctlz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test15_ctlz
+; CHECK-NOT: movq
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test16_ctlz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.ctlz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test16_ctlz
+; CHECK-NOT: movw
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test17_ctlz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.ctlz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test17_ctlz
+; CHECK-NOT: movd
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test18_ctlz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test18_ctlz
+; CHECK-NOT: movq
+; CHECK: lzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test1_cttz(i16 %v) {
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test1_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test2_cttz(i32 %v) {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test2_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test3_cttz(i64 %v) {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test3_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test4_cttz(i16 %v) {
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test4_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test5_cttz(i32 %v) {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test5_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test6_cttz(i64 %v) {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test6_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test7_cttz(i16 %v) {
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test7_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test8_cttz(i32 %v) {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test8_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test9_cttz(i64 %v) {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test9_cttz
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test10_cttz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 %v, 0
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test10_cttz
+; CHECK-NOT: movw
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test11_cttz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 %v, 0
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test11_cttz
+; CHECK-NOT: movd
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test12_cttz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test12_cttz
+; CHECK-NOT: movq
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test13_cttz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 16, i16 %cnt
+ ret i16 %cond
+}
+; CHECK-LABEL: test13_cttz
+; CHECK-NOT: movw
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test14_cttz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 32, i32 %cnt
+ ret i32 %cond
+}
+; CHECK-LABEL: test14_cttz
+; CHECK-NOT: movd
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test15_cttz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 64, i64 %cnt
+ ret i64 %cond
+}
+; CHECK-LABEL: test15_cttz
+; CHECK-NOT: movq
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i16 @test16_cttz(i16* %ptr) {
+ %v = load i16* %ptr
+ %cnt = tail call i16 @llvm.cttz.i16(i16 %v, i1 true)
+ %tobool = icmp eq i16 0, %v
+ %cond = select i1 %tobool, i16 %cnt, i16 16
+ ret i16 %cond
+}
+; CHECK-LABEL: test16_cttz
+; CHECK-NOT: movw
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i32 @test17_cttz(i32* %ptr) {
+ %v = load i32* %ptr
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %tobool = icmp eq i32 0, %v
+ %cond = select i1 %tobool, i32 %cnt, i32 32
+ ret i32 %cond
+}
+; CHECK-LABEL: test17_cttz
+; CHECK-NOT: movd
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+define i64 @test18_cttz(i64* %ptr) {
+ %v = load i64* %ptr
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 0, %v
+ %cond = select i1 %tobool, i64 %cnt, i64 64
+ ret i64 %cond
+}
+; CHECK-LABEL: test18_cttz
+; CHECK-NOT: movq
+; CHECK: tzcnt
+; CHECK-NEXT: ret
+
+
+declare i64 @llvm.cttz.i64(i64, i1)
+declare i32 @llvm.cttz.i32(i32, i1)
+declare i16 @llvm.cttz.i16(i16, i1)
+declare i64 @llvm.ctlz.i64(i64, i1)
+declare i32 @llvm.ctlz.i32(i32, i1)
+declare i16 @llvm.ctlz.i16(i16, i1)
+
diff --git a/test/CodeGen/X86/masked-iv-safe.ll b/test/CodeGen/X86/masked-iv-safe.ll
index 4a4d178..9ddc847 100644
--- a/test/CodeGen/X86/masked-iv-safe.ll
+++ b/test/CodeGen/X86/masked-iv-safe.ll
@@ -5,7 +5,7 @@
; CHECK-LABEL: count_up
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: inc
+; CHECK: incq
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up(double* %d, i64 %n) nounwind {
@@ -71,7 +71,7 @@ return:
; CHECK-LABEL: count_up_signed
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: inc
+; CHECK: incq
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @count_up_signed(double* %d, i64 %n) nounwind {
@@ -174,7 +174,7 @@ return:
; CHECK-LABEL: another_count_down
; CHECK-NOT: {{and|movz|sar|shl}}
-; CHECK: decq
+; CHECK: addq $-8,
; CHECK-NOT: {{and|movz|sar|shl}}
; CHECK: jne
define void @another_count_down(double* %d, i64 %n) nounwind {
diff --git a/test/CodeGen/X86/merge_store.ll b/test/CodeGen/X86/merge_store.ll
index 940688c..f98963d 100644
--- a/test/CodeGen/X86/merge_store.ll
+++ b/test/CodeGen/X86/merge_store.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=x86-64 -addr-sink-using-gep=1 | FileCheck %s
define void @merge_store(i32* nocapture %a) {
; CHECK-LABEL: merge_store:
diff --git a/test/CodeGen/X86/mod128.ll b/test/CodeGen/X86/mod128.ll
new file mode 100644
index 0000000..4fdee11
--- /dev/null
+++ b/test/CodeGen/X86/mod128.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s -mtriple=x86_64-linux | FileCheck %s -check-prefix=X86-64
+; RUN: llc < %s -mtriple=x86_64-cygwin | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-win32 | FileCheck %s -check-prefix=WIN64
+; RUN: llc < %s -mtriple=x86_64-mingw32 | FileCheck %s -check-prefix=WIN64
+
+define i64 @mod128(i128 %x) {
+ ; X86-64: movl $3, %edx
+ ; X86-64: xorl %ecx, %ecx
+ ; X86-64: callq __modti3
+ ; X86-64-NOT: movd %xmm0, %rax
+
+ ; WIN64-NOT: movl $3, %r8d
+ ; WIN64-NOT: xorl %r9d, %r9d
+ ; WIN64-DAG: movq %rdx, 56(%rsp)
+ ; WIN64-DAG: movq %rcx, 48(%rsp)
+ ; WIN64-DAG: leaq 48(%rsp), %rcx
+ ; WIN64-DAG: leaq 32(%rsp), %rdx
+ ; WIN64-DAG: movq $0, 40(%rsp)
+ ; WIN64-DAG: movq $3, 32(%rsp)
+ ; WIN64: callq __modti3
+ ; WIN64: movd %xmm0, %rax
+
+ %1 = srem i128 %x, 3
+ %2 = trunc i128 %1 to i64
+ ret i64 %2
+}
diff --git a/test/CodeGen/X86/musttail-indirect.ll b/test/CodeGen/X86/musttail-indirect.ll
new file mode 100644
index 0000000..9d21b5e
--- /dev/null
+++ b/test/CodeGen/X86/musttail-indirect.ll
@@ -0,0 +1,124 @@
+; RUN: llc < %s -mtriple=i686-win32 | FileCheck %s
+; RUN: llc < %s -mtriple=i686-win32 -O0 | FileCheck %s
+
+; IR simplified from the following C++ snippet compiled for i686-windows-msvc:
+
+; struct A { A(); ~A(); int a; };
+;
+; struct B {
+; virtual int f(int);
+; virtual int g(A, int, A);
+; virtual void h(A, int, A);
+; virtual A i(A, int, A);
+; virtual A j(int);
+; };
+;
+; int (B::*mp_f)(int) = &B::f;
+; int (B::*mp_g)(A, int, A) = &B::g;
+; void (B::*mp_h)(A, int, A) = &B::h;
+; A (B::*mp_i)(A, int, A) = &B::i;
+; A (B::*mp_j)(int) = &B::j;
+
+; Each member pointer creates a thunk. The ones with inalloca are required to
+; be tail calls by the ABI, even at O0.
+
+%struct.B = type { i32 (...)** }
+%struct.A = type { i32 }
+
+; CHECK-LABEL: f_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @f_thunk(%struct.B* %this, i32) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, i32)***
+ %vtable = load i32 (%struct.B*, i32)*** %1
+ %2 = load i32 (%struct.B*, i32)** %vtable
+ %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, i32 %0)
+ ret i32 %3
+}
+
+; Inalloca thunks shouldn't require any stores to the stack.
+; CHECK-LABEL: g_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc i32 @g_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+ %vtable = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+ %3 = musttail call x86_thiscallcc i32 %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+ ret i32 %3
+}
+
+; CHECK-LABEL: h_thunk:
+; CHECK: jmpl
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK-NOT: ret
+define x86_thiscallcc void @h_thunk(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)***
+ %vtable = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vtable, i32 2
+ %2 = load void (%struct.B*, <{ %struct.A, i32, %struct.A }>*)** %vfn
+ musttail call x86_thiscallcc void %2(%struct.B* %this, <{ %struct.A, i32, %struct.A }>* inalloca %0)
+ ret void
+}
+
+; CHECK-LABEL: i_thunk:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc %struct.A* @i_thunk(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)***
+ %vtable = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vtable, i32 3
+ %2 = load %struct.A* (%struct.B*, <{ %struct.A*, %struct.A, i32, %struct.A }>*)** %vfn
+ %3 = musttail call x86_thiscallcc %struct.A* %2(%struct.B* %this, <{ %struct.A*, %struct.A, i32, %struct.A }>* inalloca %0)
+ ret %struct.A* %3
+}
+
+; CHECK-LABEL: j_thunk:
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_thiscallcc void @j_thunk(%struct.A* noalias sret %agg.result, %struct.B* %this, i32) {
+entry:
+ %1 = bitcast %struct.B* %this to void (%struct.A*, %struct.B*, i32)***
+ %vtable = load void (%struct.A*, %struct.B*, i32)*** %1
+ %vfn = getelementptr inbounds void (%struct.A*, %struct.B*, i32)** %vtable, i32 4
+ %2 = load void (%struct.A*, %struct.B*, i32)** %vfn
+ musttail call x86_thiscallcc void %2(%struct.A* sret %agg.result, %struct.B* %this, i32 %0)
+ ret void
+}
+
+; CHECK-LABEL: _stdcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_stdcallcc i32 @stdcall_thunk(<{ %struct.B*, %struct.A }>* inalloca) {
+entry:
+ %this_ptr = getelementptr inbounds <{ %struct.B*, %struct.A }>* %0, i32 0, i32 0
+ %this = load %struct.B** %this_ptr
+ %1 = bitcast %struct.B* %this to i32 (<{ %struct.B*, %struct.A }>*)***
+ %vtable = load i32 (<{ %struct.B*, %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (<{ %struct.B*, %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (<{ %struct.B*, %struct.A }>*)** %vfn
+ %3 = musttail call x86_stdcallcc i32 %2(<{ %struct.B*, %struct.A }>* inalloca %0)
+ ret i32 %3
+}
+
+; CHECK-LABEL: @fastcall_thunk@8:
+; CHECK-NOT: mov %{{.*}}, {{.*(.*esp.*)}}
+; CHECK: jmpl
+; CHECK-NOT: ret
+define x86_fastcallcc i32 @fastcall_thunk(%struct.B* inreg %this, <{ %struct.A }>* inalloca) {
+entry:
+ %1 = bitcast %struct.B* %this to i32 (%struct.B*, <{ %struct.A }>*)***
+ %vtable = load i32 (%struct.B*, <{ %struct.A }>*)*** %1
+ %vfn = getelementptr inbounds i32 (%struct.B*, <{ %struct.A }>*)** %vtable, i32 1
+ %2 = load i32 (%struct.B*, <{ %struct.A }>*)** %vfn
+ %3 = musttail call x86_fastcallcc i32 %2(%struct.B* inreg %this, <{ %struct.A }>* inalloca %0)
+ ret i32 %3
+}
diff --git a/test/CodeGen/X86/musttail-thiscall.ll b/test/CodeGen/X86/musttail-thiscall.ll
new file mode 100644
index 0000000..8ea1248
--- /dev/null
+++ b/test/CodeGen/X86/musttail-thiscall.ll
@@ -0,0 +1,31 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+
+; CHECK-LABEL: t1:
+; CHECK: jmp {{_?}}t1_callee
+define x86_thiscallcc void @t1(i8* %this) {
+ %adj = getelementptr i8* %this, i32 4
+ musttail call x86_thiscallcc void @t1_callee(i8* %adj)
+ ret void
+}
+declare x86_thiscallcc void @t1_callee(i8* %this)
+
+; CHECK-LABEL: t2:
+; CHECK: jmp {{_?}}t2_callee
+define x86_thiscallcc i32 @t2(i8* %this, i32 %a) {
+ %adj = getelementptr i8* %this, i32 4
+ %rv = musttail call x86_thiscallcc i32 @t2_callee(i8* %adj, i32 %a)
+ ret i32 %rv
+}
+declare x86_thiscallcc i32 @t2_callee(i8* %this, i32 %a)
+
+; CHECK-LABEL: t3:
+; CHECK: jmp {{_?}}t3_callee
+define x86_thiscallcc i8* @t3(i8* %this, <{ i8*, i32 }>* inalloca %args) {
+ %adj = getelementptr i8* %this, i32 4
+ %a_ptr = getelementptr <{ i8*, i32 }>* %args, i32 0, i32 1
+ store i32 0, i32* %a_ptr
+ %rv = musttail call x86_thiscallcc i8* @t3_callee(i8* %adj, <{ i8*, i32 }>* inalloca %args)
+ ret i8* %rv
+}
+declare x86_thiscallcc i8* @t3_callee(i8* %this, <{ i8*, i32 }>* inalloca %args);
diff --git a/test/CodeGen/X86/musttail.ll b/test/CodeGen/X86/musttail.ll
new file mode 100644
index 0000000..ca5d311
--- /dev/null
+++ b/test/CodeGen/X86/musttail.ll
@@ -0,0 +1,90 @@
+; RUN: llc -march=x86 < %s | FileCheck %s
+; RUN: llc -march=x86 -O0 < %s | FileCheck %s
+; RUN: llc -march=x86 -disable-tail-calls < %s | FileCheck %s
+
+declare void @t1_callee(i8*)
+define void @t1(i32* %a) {
+; CHECK-LABEL: t1:
+; CHECK: jmp {{_?}}t1_callee
+ %b = bitcast i32* %a to i8*
+ musttail call void @t1_callee(i8* %b)
+ ret void
+}
+
+declare i8* @t2_callee()
+define i32* @t2() {
+; CHECK-LABEL: t2:
+; CHECK: jmp {{_?}}t2_callee
+ %v = musttail call i8* @t2_callee()
+ %w = bitcast i8* %v to i32*
+ ret i32* %w
+}
+
+; Complex frame layout: stack realignment with dynamic alloca.
+define void @t3(i32 %n) alignstack(32) nounwind {
+entry:
+; CHECK: t3:
+; CHECK: pushl %ebp
+; CHECK: pushl %esi
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK-NEXT: jmp {{_?}}t3_callee
+ %a = alloca i8, i32 %n
+ call void @capture(i8* %a)
+ musttail call void @t3_callee(i32 %n) nounwind
+ ret void
+}
+
+declare void @capture(i8*)
+declare void @t3_callee(i32)
+
+; Test that we actually copy in and out stack arguments that aren't forwarded
+; without modification.
+define i32 @t4({}* %fn, i32 %n, i32 %r) {
+; CHECK-LABEL: t4:
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; CHECK: movl %[[r]], {{[0-9]+}}(%esp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%esp)
+; CHECK: jmpl *%{{.*}}
+
+entry:
+ %r1 = add i32 %r, 1
+ %n1 = sub i32 %n, 1
+ %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+ %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+ ret i32 %r2
+}
+
+; Combine the complex stack frame with the parameter modification.
+define i32 @t5({}* %fn, i32 %n, i32 %r) alignstack(32) {
+; CHECK-LABEL: t5:
+; CHECK: pushl %ebp
+; CHECK: movl %esp, %ebp
+; CHECK: pushl %esi
+; Align the stack.
+; CHECK: andl $-32, %esp
+; CHECK: movl %esp, %esi
+; Modify the args.
+; CHECK: incl %[[r:.*]]
+; CHECK: decl %[[n:.*]]
+; Store them through ebp, since that's the only stable arg pointer.
+; CHECK: movl %[[r]], {{[0-9]+}}(%ebp)
+; CHECK: movl %[[n]], {{[0-9]+}}(%ebp)
+; Epilogue.
+; CHECK: leal {{[-0-9]+}}(%ebp), %esp
+; CHECK: popl %esi
+; CHECK: popl %ebp
+; CHECK: jmpl *%{{.*}}
+
+entry:
+ %a = alloca i8, i32 %n
+ call void @capture(i8* %a)
+ %r1 = add i32 %r, 1
+ %n1 = sub i32 %n, 1
+ %fn_cast = bitcast {}* %fn to i32 ({}*, i32, i32)*
+ %r2 = musttail call i32 %fn_cast({}* %fn, i32 %n1, i32 %r1)
+ ret i32 %r2
+}
diff --git a/test/CodeGen/X86/named-reg-alloc.ll b/test/CodeGen/X86/named-reg-alloc.ll
new file mode 100644
index 0000000..9463ea3
--- /dev/null
+++ b/test/CodeGen/X86/named-reg-alloc.ll
@@ -0,0 +1,14 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; FIXME: Include an allocatable-specific error message
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"eax\00"}
diff --git a/test/CodeGen/X86/named-reg-notareg.ll b/test/CodeGen/X86/named-reg-notareg.ll
new file mode 100644
index 0000000..d85dddd
--- /dev/null
+++ b/test/CodeGen/X86/named-reg-notareg.ll
@@ -0,0 +1,13 @@
+; RUN: not llc < %s -mtriple=x86_64-apple-darwin 2>&1 | FileCheck %s
+; RUN: not llc < %s -mtriple=x86_64-linux-gnueabi 2>&1 | FileCheck %s
+
+define i32 @get_stack() nounwind {
+entry:
+; CHECK: Invalid register name global variable
+ %sp = call i32 @llvm.read_register.i32(metadata !0)
+ ret i32 %sp
+}
+
+declare i32 @llvm.read_register.i32(metadata) nounwind
+
+!0 = metadata !{metadata !"notareg\00"}
diff --git a/test/CodeGen/X86/no-cfi.ll b/test/CodeGen/X86/no-cfi.ll
deleted file mode 100644
index 5bb9bb2..0000000
--- a/test/CodeGen/X86/no-cfi.ll
+++ /dev/null
@@ -1,34 +0,0 @@
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -disable-cfi | FileCheck --check-prefix=STATIC %s
-; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu -disable-cfi -relocation-model=pic | FileCheck --check-prefix=PIC %s
-
-; STATIC: .ascii "zPLR"
-; STATIC: .byte 3
-; STATIC-NEXT: .long __gxx_personality_v0
-; STATIC-NEXT: .byte 3
-; STATIC-NEXT: .byte 3
-
-; PIC: .ascii "zPLR"
-; PIC: .byte 155
-; PIC-NEXT: .L
-; PIC-NEXT: .long DW.ref.__gxx_personality_v0-.L
-; PIC-NEXT: .byte 27
-; PIC-NEXT: .byte 27
-
-
-define void @bar() {
-entry:
- %call = invoke i32 @foo()
- to label %invoke.cont unwind label %lpad
-
-invoke.cont:
- ret void
-
-lpad:
- %exn = landingpad {i8*, i32} personality i32 (...)* @__gxx_personality_v0
- catch i8* null
- ret void
-}
-
-declare i32 @foo()
-
-declare i32 @__gxx_personality_v0(...)
diff --git a/test/CodeGen/X86/peep-test-4.ll b/test/CodeGen/X86/peep-test-4.ll
index 884ee7c..1ae621f 100644
--- a/test/CodeGen/X86/peep-test-4.ll
+++ b/test/CodeGen/X86/peep-test-4.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+bmi,+bmi2,+popcnt | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-linux -mattr=+bmi,+bmi2,+popcnt,+lzcnt | FileCheck %s
declare void @foo(i32)
+declare void @foo32(i32)
declare void @foo64(i64)
; CHECK-LABEL: neg:
@@ -189,3 +190,76 @@ bb:
return:
ret void
}
+
+; CHECK-LABEL: testCTZ
+; CHECK: tzcntq
+; CHECK-NOT: test
+; CHECK: cmovaeq
+declare i64 @llvm.cttz.i64(i64, i1)
+define i64 @testCTZ(i64 %v) nounwind {
+ %cnt = tail call i64 @llvm.cttz.i64(i64 %v, i1 true)
+ %tobool = icmp eq i64 %v, 0
+ %cond = select i1 %tobool, i64 255, i64 %cnt
+ ret i64 %cond
+}
+
+; CHECK-LABEL: testCTZ2
+; CHECK: tzcntl
+; CHECK-NEXT: jb
+; CHECK: jmp foo
+declare i32 @llvm.cttz.i32(i32, i1)
+define void @testCTZ2(i32 %v) nounwind {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %cmp = icmp eq i32 %v, 0
+ br i1 %cmp, label %return, label %bb
+
+bb:
+ tail call void @foo(i32 %cnt)
+ br label %return
+
+return:
+ tail call void @foo32(i32 %cnt)
+ ret void
+}
+
+; CHECK-LABEL: testCTZ3
+; CHECK: tzcntl
+; CHECK-NEXT: jae
+; CHECK: jmp foo
+define void @testCTZ3(i32 %v) nounwind {
+ %cnt = tail call i32 @llvm.cttz.i32(i32 %v, i1 true)
+ %cmp = icmp ne i32 %v, 0
+ br i1 %cmp, label %return, label %bb
+
+bb:
+ tail call void @foo(i32 %cnt)
+ br label %return
+
+return:
+ tail call void @foo32(i32 %cnt)
+ ret void
+}
+
+; CHECK-LABEL: testCLZ
+; CHECK: lzcntq
+; CHECK-NOT: test
+; CHECK: cmovaeq
+declare i64 @llvm.ctlz.i64(i64, i1)
+define i64 @testCLZ(i64 %v) nounwind {
+ %cnt = tail call i64 @llvm.ctlz.i64(i64 %v, i1 true)
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 %cnt, i64 255
+ ret i64 %cond
+}
+
+; CHECK-LABEL: testPOPCNT
+; CHECK: popcntq
+; CHECK-NOT: test
+; CHECK: cmovneq
+declare i64 @llvm.ctpop.i64(i64)
+define i64 @testPOPCNT(i64 %v) nounwind {
+ %cnt = tail call i64 @llvm.ctpop.i64(i64 %v)
+ %tobool = icmp ne i64 %v, 0
+ %cond = select i1 %tobool, i64 %cnt, i64 255
+ ret i64 %cond
+}
diff --git a/test/CodeGen/X86/peephole-multiple-folds.ll b/test/CodeGen/X86/peephole-multiple-folds.ll
index d184569..a6cec66 100644
--- a/test/CodeGen/X86/peephole-multiple-folds.ll
+++ b/test/CodeGen/X86/peephole-multiple-folds.ll
@@ -9,8 +9,8 @@ entry:
loopbody:
; CHECK: test_peephole_multi_fold:
-; CHECK: vfmadd231ps (%rdi),
-; CHECK: vfmadd231ps (%rsi),
+; CHECK: vfmadd231ps ({{%rdi|%rcx}}),
+; CHECK: vfmadd231ps ({{%rsi|%rdx}}),
%vsum1 = phi <8 x float> [ %vsum1.next, %loopbody ], [ zeroinitializer, %entry ]
%vsum2 = phi <8 x float> [ %vsum2.next, %loopbody ], [ zeroinitializer, %entry ]
%m1 = load <8 x float>* %p1, align 1
diff --git a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
index f3669fb..d8e4572 100644
--- a/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
+++ b/test/CodeGen/X86/ragreedy-last-chance-recoloring.ll
@@ -2,6 +2,16 @@
; Without the last chance recoloring, this test fails with:
; "ran out of registers".
+; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-depth=0 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-DEPTH
+; Test whether failure due to cutoff for depth is reported
+
+; RUN: not llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 < %s 2>&1 | FileCheck %s --check-prefix=CHECK-INTERF
+; Test whether failure due to cutoff for interference is reported
+
+; RUN: llc -regalloc=greedy -relocation-model=pic -lcr-max-interf=1 -lcr-max-depth=0 -exhaustive-register-search < %s > %t 2>&1
+; RUN: FileCheck --input-file=%t %s --check-prefix=CHECK-EXHAUSTIVE
+; Test whether exhaustive-register-search can bypass the depth and interference cutoffs of last chance recoloring
+
target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
target triple = "i386-apple-macosx"
@@ -12,6 +22,9 @@ target triple = "i386-apple-macosx"
; Function Attrs: nounwind ssp
; CHECK-NOT: ran out of registers during register allocation
+; CHECK-INTERF: error: register allocation failed: maximum interference for recoloring reached
+; CHECK-DEPTH: error: register allocation failed: maximum depth for recoloring reached
+; CHECK-EXHAUSTIVE-NOT: error: register allocation failed: maximum {{depth|interference}} for recoloring reached
define void @fp_dh_f870bf31fd8ffe068450366e3f05389a(i8* %arg) #0 {
bb:
indirectbr i8* undef, [label %bb85, label %bb206]
diff --git a/test/CodeGen/X86/rdtsc.ll b/test/CodeGen/X86/rdtsc.ll
index f21a44c..dba614a 100644
--- a/test/CodeGen/X86/rdtsc.ll
+++ b/test/CodeGen/X86/rdtsc.ll
@@ -1,8 +1,49 @@
-; RUN: llc < %s -march=x86 | grep rdtsc
-; RUN: llc < %s -march=x86-64 | grep rdtsc
-declare i64 @llvm.readcyclecounter()
+; RUN: llc < %s -march=x86-64 -mcpu=generic | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=generic | FileCheck %s --check-prefix=CHECK --check-prefix=X86
+
+; Verify that we correctly lower ISD::READCYCLECOUNTER.
+
+
+define i64 @test_builtin_readcyclecounter() {
+ %1 = tail call i64 @llvm.readcyclecounter()
+ ret i64 %1
+}
+; CHECK-LABEL: test_builtin_readcyclecounter
+; CHECK: rdtsc
+; X86-NOT: shlq
+; X86-NOT: or
+; CHECK-NOT: mov
+; CHECK: ret
+
+
+; Verify that we correctly lower the Read Cycle Counter GCC x86 builtins
+; (i.e. RDTSC and RDTSCP).
-define i64 @foo() {
- %tmp.1 = call i64 @llvm.readcyclecounter( ) ; <i64> [#uses=1]
- ret i64 %tmp.1
+define i64 @test_builtin_rdtsc() {
+ %1 = tail call i64 @llvm.x86.rdtsc()
+ ret i64 %1
}
+; CHECK-LABEL: test_builtin_rdtsc
+; CHECK: rdtsc
+; X86-NOT: shlq
+; X86-NOT: or
+; CHECK-NOT: mov
+; CHECK: ret
+
+
+define i64 @test_builtin_rdtscp(i8* %A) {
+ %1 = tail call i64 @llvm.x86.rdtscp(i8* %A)
+ ret i64 %1
+}
+; CHECK-LABEL: test_builtin_rdtscp
+; CHECK: rdtscp
+; X86-NOT: shlq
+; CHECK: movl %ecx, (%{{[a-z0-9]+}})
+; X86-NOT: shlq
+; CHECK: ret
+
+
+declare i64 @llvm.readcyclecounter()
+declare i64 @llvm.x86.rdtscp(i8*)
+declare i64 @llvm.x86.rdtsc()
+
diff --git a/test/CodeGen/X86/remat-invalid-liveness.ll b/test/CodeGen/X86/remat-invalid-liveness.ll
new file mode 100644
index 0000000..d285e83
--- /dev/null
+++ b/test/CodeGen/X86/remat-invalid-liveness.ll
@@ -0,0 +1,85 @@
+; RUN: llc %s -mcpu=core2 -o - | FileCheck %s
+; This test was failing while tracking the liveness in the register scavenger
+; during the branching folding pass. The allocation of the subregisters was
+; incorrect.
+; I.e., the faulty pattern looked like:
+; CH = movb 64
+; ECX = movl 3 <- CH was killed here.
+; CH = subb CH, ...
+;
+; This reduced test case triggers the crash before the fix, but does not
+; strictly speaking check that the resulting code is correct.
+; To check that the code is actually correct we would need to check the
+; liveness of the produced code.
+;
+; Currently, we check that after ECX = movl 3, we do not have subb CH,
+; whereas CH could have been redefined in between and that would have been
+; totally fine.
+; <rdar://problem/16582185>
+target datalayout = "e-m:o-p:32:32-f64:32:64-f80:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.9"
+
+%struct.A = type { %struct.B, %struct.C, %struct.D*, [1 x i8*] }
+%struct.B = type { i32, [4 x i8] }
+%struct.C = type { i128 }
+%struct.D = type { {}*, [0 x i32] }
+%union.E = type { i32 }
+
+; CHECK-LABEL: __XXX1:
+; CHECK: movl $3, %ecx
+; CHECK-NOT: subb %{{[a-z]+}}, %ch
+; Function Attrs: nounwind optsize ssp
+define fastcc void @__XXX1(%struct.A* %ht) #0 {
+entry:
+ %const72 = bitcast i128 72 to i128
+ %const3 = bitcast i128 3 to i128
+ switch i32 undef, label %if.end196 [
+ i32 1, label %sw.bb.i
+ i32 3, label %sw.bb2.i
+ ]
+
+sw.bb.i: ; preds = %entry
+ %call.i.i.i = tail call i32 undef(%struct.A* %ht, i8 zeroext 22, i32 undef, i32 0, %struct.D* undef)
+ %bf.load.i.i = load i128* undef, align 4
+ %bf.lshr.i.i = lshr i128 %bf.load.i.i, %const72
+ %shl1.i.i = shl nuw nsw i128 %bf.lshr.i.i, 8
+ %shl.i.i = trunc i128 %shl1.i.i to i32
+ br i1 undef, label %cond.false10.i.i, label %__XXX2.exit.i.i
+
+__XXX2.exit.i.i: ; preds = %sw.bb.i
+ %extract11.i.i.i = lshr i128 %bf.load.i.i, %const3
+ %extract.t12.i.i.i = trunc i128 %extract11.i.i.i to i32
+ %bf.cast7.i.i.i = and i32 %extract.t12.i.i.i, 3
+ %arrayidx.i.i.i = getelementptr inbounds %struct.A* %ht, i32 0, i32 3, i32 %bf.cast7.i.i.i
+ br label %cond.end12.i.i
+
+cond.false10.i.i: ; preds = %sw.bb.i
+ %arrayidx.i6.i.i = getelementptr inbounds %struct.A* %ht, i32 0, i32 3, i32 0
+ br label %cond.end12.i.i
+
+cond.end12.i.i: ; preds = %cond.false10.i.i, %__XXX2.exit.i.i
+ %.sink.in.i.i = phi i8** [ %arrayidx.i.i.i, %__XXX2.exit.i.i ], [ %arrayidx.i6.i.i, %cond.false10.i.i ]
+ %.sink.i.i = load i8** %.sink.in.i.i, align 4
+ %tmp = bitcast i8* %.sink.i.i to %union.E*
+ br i1 undef, label %for.body.i.i, label %if.end196
+
+for.body.i.i: ; preds = %for.body.i.i, %cond.end12.i.i
+ %weak.i.i = getelementptr inbounds %union.E* %tmp, i32 undef, i32 0
+ %tmp1 = load i32* %weak.i.i, align 4
+ %cmp36.i.i = icmp ne i32 %tmp1, %shl.i.i
+ %or.cond = and i1 %cmp36.i.i, false
+ br i1 %or.cond, label %for.body.i.i, label %if.end196
+
+sw.bb2.i: ; preds = %entry
+ %bf.lshr.i85.i = lshr i128 undef, %const72
+ br i1 undef, label %if.end196, label %__XXX2.exit.i95.i
+
+__XXX2.exit.i95.i: ; preds = %sw.bb2.i
+ %extract11.i.i91.i = lshr i128 undef, %const3
+ br label %if.end196
+
+if.end196: ; preds = %__XXX2.exit.i95.i, %sw.bb2.i, %for.body.i.i, %cond.end12.i.i, %entry
+ ret void
+}
+
+attributes #0 = { nounwind optsize ssp "no-frame-pointer-elim"="true" "no-frame-pointer-elim-non-leaf" }
diff --git a/test/CodeGen/X86/ret-mmx.ll b/test/CodeGen/X86/ret-mmx.ll
index 091fd53..fc9c78d 100644
--- a/test/CodeGen/X86/ret-mmx.ll
+++ b/test/CodeGen/X86/ret-mmx.ll
@@ -34,6 +34,7 @@ define double @t4() nounwind {
ret double bitcast (<2 x i32> <i32 1, i32 0> to double)
; CHECK-LABEL: t4:
; CHECK: movl $1
+; CHECK-NOT: pshufd
; CHECK: movd {{.*}}, %xmm0
}
diff --git a/test/CodeGen/X86/rotate3.ll b/test/CodeGen/X86/rotate3.ll
deleted file mode 100644
index b92f7c2..0000000
--- a/test/CodeGen/X86/rotate3.ll
+++ /dev/null
@@ -1,76 +0,0 @@
-; Check that (or (shl x, y), (srl x, (sub 32, y))) is folded into (rotl x, y)
-; and (or (shl x, (sub 32, y)), (srl x, r)) into (rotr x, y) even if the
-; argument is zero extended. Fix for PR16726.
-
-; RUN: llc < %s -march=x86-64 -mcpu=corei7 | FileCheck %s
-
-define zeroext i8 @rolbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i8 %x_arg to i32
- %tmp3 = shl i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 8, %nBits_arg
- %tmp10 = lshr i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i8
- ret i8 %tmp12
-}
-; CHECK: rolb %cl, %{{[a-z0-9]+}}
-
-
-define zeroext i8 @rorbyte(i32 %nBits_arg, i8 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i8 %x_arg to i32
- %tmp3 = lshr i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 8, %nBits_arg
- %tmp10 = shl i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i8
- ret i8 %tmp12
-}
-; CHECK: rorb %cl, %{{[a-z0-9]+}}
-
-define zeroext i16 @rolword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i16 %x_arg to i32
- %tmp3 = shl i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 16, %nBits_arg
- %tmp10 = lshr i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i16
- ret i16 %tmp12
-}
-; CHECK: rolw %cl, %{{[a-z0-9]+}}
-
-define zeroext i16 @rorword(i32 %nBits_arg, i16 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i16 %x_arg to i32
- %tmp3 = lshr i32 %tmp1, %nBits_arg
- %tmp8 = sub i32 16, %nBits_arg
- %tmp10 = shl i32 %tmp1, %tmp8
- %tmp11 = or i32 %tmp3, %tmp10
- %tmp12 = trunc i32 %tmp11 to i16
- ret i16 %tmp12
-}
-; CHECK: rorw %cl, %{{[a-z0-9]+}}
-
-define i64 @roldword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i32 %x_arg to i64
- %tmp3 = shl i64 %tmp1, %nBits_arg
- %tmp8 = sub i64 32, %nBits_arg
- %tmp10 = lshr i64 %tmp1, %tmp8
- %tmp11 = or i64 %tmp3, %tmp10
- ret i64 %tmp11
-}
-; CHECK: roll %cl, %{{[a-z0-9]+}}
-
-define zeroext i64 @rordword(i64 %nBits_arg, i32 %x_arg) nounwind readnone {
-entry:
- %tmp1 = zext i32 %x_arg to i64
- %tmp3 = lshr i64 %tmp1, %nBits_arg
- %tmp8 = sub i64 32, %nBits_arg
- %tmp10 = shl i64 %tmp1, %tmp8
- %tmp11 = or i64 %tmp3, %tmp10
- ret i64 %tmp11
-}
-; CHECK: rorl %cl, %{{[a-z0-9]+}}
diff --git a/test/CodeGen/X86/segmented-stacks-dynamic.ll b/test/CodeGen/X86/segmented-stacks-dynamic.ll
index e170762..b82be41 100644
--- a/test/CodeGen/X86/segmented-stacks-dynamic.ll
+++ b/test/CodeGen/X86/segmented-stacks-dynamic.ll
@@ -1,12 +1,12 @@
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define i32 @test_basic(i32 %l) {
+define i32 @test_basic(i32 %l) #0 {
%mem = alloca i32, i32 %l
call void @dummy_use (i32* %mem, i32 %l)
%terminate = icmp eq i32 %l, 0
@@ -62,3 +62,5 @@ false:
; X64: movq %rax, %rdi
}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/segmented-stacks.ll b/test/CodeGen/X86/segmented-stacks.ll
index c02152b..9dab3cd 100644
--- a/test/CodeGen/X86/segmented-stacks.ll
+++ b/test/CodeGen/X86/segmented-stacks.ll
@@ -1,23 +1,23 @@
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
-; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
-; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -verify-machineinstrs | FileCheck %s -check-prefix=X32-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -verify-machineinstrs | FileCheck %s -check-prefix=X64-Linux
+; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X32-Darwin
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -verify-machineinstrs | FileCheck %s -check-prefix=X64-Darwin
+; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X32-MinGW
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -verify-machineinstrs | FileCheck %s -check-prefix=X64-FreeBSD
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -verify-machineinstrs | FileCheck %s -check-prefix=X64-MinGW
; We used to crash with filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -segmented-stacks -filetype=obj
-; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -segmented-stacks -filetype=obj
-
-; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris -segmented-stacks 2> %t.log
+; RUN: llc < %s -mcpu=generic -mtriple=i686-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-linux -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-darwin -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-darwin -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=i686-mingw32 -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-freebsd -filetype=obj
+; RUN: llc < %s -mcpu=generic -mtriple=x86_64-mingw32 -filetype=obj
+
+; RUN: not llc < %s -mcpu=generic -mtriple=x86_64-solaris 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X64-Solaris
-; RUN: not llc < %s -mcpu=generic -mtriple=i686-freebsd -segmented-stacks 2> %t.log
+; RUN: not llc < %s -mcpu=generic -mtriple=i686-freebsd 2> %t.log
; RUN: FileCheck %s -input-file=%t.log -check-prefix=X32-FreeBSD
; X64-Solaris: Segmented stacks not supported on this platform
@@ -26,7 +26,7 @@
; Just to prevent the alloca from being optimized away
declare void @dummy_use(i32*, i32)
-define void @test_basic() {
+define void @test_basic() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -104,16 +104,18 @@ define void @test_basic() {
}
-define i32 @test_nested(i32 * nest %closure, i32 %other) {
+define i32 @test_nested(i32 * nest %closure, i32 %other) #0 {
%addend = load i32 * %closure
%result = add i32 %other, %addend
+ %mem = alloca i32, i32 10
+ call void @dummy_use (i32* %mem, i32 10)
ret i32 %result
; X32-Linux: cmpl %gs:48, %esp
; X32-Linux-NEXT: ja .LBB1_2
; X32-Linux: pushl $4
-; X32-Linux-NEXT: pushl $0
+; X32-Linux-NEXT: pushl $60
; X32-Linux-NEXT: calll __morestack
; X32-Linux-NEXT: ret
@@ -121,7 +123,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-Linux-NEXT: ja .LBB1_2
; X64-Linux: movq %r10, %rax
-; X64-Linux-NEXT: movabsq $0, %r10
+; X64-Linux-NEXT: movabsq $56, %r10
; X64-Linux-NEXT: movabsq $0, %r11
; X64-Linux-NEXT: callq __morestack
; X64-Linux-NEXT: ret
@@ -132,7 +134,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X32-Darwin-NEXT: ja LBB1_2
; X32-Darwin: pushl $4
-; X32-Darwin-NEXT: pushl $0
+; X32-Darwin-NEXT: pushl $60
; X32-Darwin-NEXT: calll ___morestack
; X32-Darwin-NEXT: ret
@@ -140,7 +142,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-Darwin-NEXT: ja LBB1_2
; X64-Darwin: movq %r10, %rax
-; X64-Darwin-NEXT: movabsq $0, %r10
+; X64-Darwin-NEXT: movabsq $56, %r10
; X64-Darwin-NEXT: movabsq $0, %r11
; X64-Darwin-NEXT: callq ___morestack
; X64-Darwin-NEXT: ret
@@ -150,7 +152,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X32-MinGW-NEXT: ja LBB1_2
; X32-MinGW: pushl $4
-; X32-MinGW-NEXT: pushl $0
+; X32-MinGW-NEXT: pushl $52
; X32-MinGW-NEXT: calll ___morestack
; X32-MinGW-NEXT: ret
@@ -159,7 +161,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-MinGW-NEXT: ja .LBB1_2
; X64-MinGW: movq %r10, %rax
-; X64-MinGW-NEXT: movabsq $0, %r10
+; X64-MinGW-NEXT: movabsq $88, %r10
; X64-MinGW-NEXT: movabsq $32, %r11
; X64-MinGW-NEXT: callq __morestack
; X64-MinGW-NEXT: retq
@@ -169,7 +171,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
; X64-FreeBSD-NEXT: ja .LBB1_2
; X64-FreeBSD: movq %r10, %rax
-; X64-FreeBSD-NEXT: movabsq $0, %r10
+; X64-FreeBSD-NEXT: movabsq $56, %r10
; X64-FreeBSD-NEXT: movabsq $0, %r11
; X64-FreeBSD-NEXT: callq __morestack
; X64-FreeBSD-NEXT: ret
@@ -177,7 +179,7 @@ define i32 @test_nested(i32 * nest %closure, i32 %other) {
}
-define void @test_large() {
+define void @test_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -249,7 +251,7 @@ define void @test_large() {
}
-define fastcc void @test_fastcc() {
+define fastcc void @test_fastcc() #0 {
%mem = alloca i32, i32 10
call void @dummy_use (i32* %mem, i32 10)
ret void
@@ -327,7 +329,7 @@ define fastcc void @test_fastcc() {
}
-define fastcc void @test_fastcc_large() {
+define fastcc void @test_fastcc_large() #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 0)
ret void
@@ -412,7 +414,7 @@ define fastcc void @test_fastcc_large() {
}
-define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) {
+define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) #0 {
%mem = alloca i32, i32 10000
call void @dummy_use (i32* %mem, i32 %a)
ret void
@@ -434,3 +436,30 @@ define fastcc void @test_fastcc_large_with_ecx_arg(i32 %a) {
; X32-Darwin-NEXT: ret
}
+
+define void @test_nostack() #0 {
+ ret void
+
+; X32-Linux-LABEL: test_nostack:
+; X32-Linux-NOT: calll __morestack
+
+; X64-Linux-LABEL: test_nostack:
+; X32-Linux-NOT: callq __morestack
+
+; X32-Darwin-LABEL: test_nostack:
+; X32-Darwin-NOT: calll __morestack
+
+; X64-Darwin-LABEL: test_nostack:
+; X64-Darwin-NOT: callq __morestack
+
+; X32-MinGW-LABEL: test_nostack:
+; X32-MinGW-NOT: calll __morestack
+
+; X64-MinGW-LABEL: test_nostack:
+; X64-MinGW-NOT: callq __morestack
+
+; X64-FreeBSD-LABEL: test_nostack:
+; X64-FreeBSD-NOT: callq __morestack
+}
+
+attributes #0 = { "split-stack" }
diff --git a/test/CodeGen/X86/sse2.ll b/test/CodeGen/X86/sse2.ll
index 628dba0..e8d3d6f 100644
--- a/test/CodeGen/X86/sse2.ll
+++ b/test/CodeGen/X86/sse2.ll
@@ -221,3 +221,21 @@ entry:
%double2float.i = fptrunc <4 x double> %0 to <4 x float>
ret <4 x float> %double2float.i
}
+
+define <2 x i64> @test_insert_64_zext(<2 x i64> %i) {
+; CHECK-LABEL: test_insert_64_zext
+; CHECK-NOT: xor
+; CHECK: movq
+ %1 = shufflevector <2 x i64> %i, <2 x i64> <i64 0, i64 undef>, <2 x i32> <i32 0, i32 2>
+ ret <2 x i64> %1
+}
+
+define <4 x i32> @PR19721(<4 x i32> %i) {
+ %bc = bitcast <4 x i32> %i to i128
+ %insert = and i128 %bc, -4294967296
+ %bc2 = bitcast i128 %insert to <4 x i32>
+ ret <4 x i32> %bc2
+
+; CHECK-LABEL: PR19721
+; CHECK: punpckldq
+}
diff --git a/test/CodeGen/X86/sse3.ll b/test/CodeGen/X86/sse3.ll
index 6d5b192..18bdcb3 100644
--- a/test/CodeGen/X86/sse3.ll
+++ b/test/CodeGen/X86/sse3.ll
@@ -209,7 +209,7 @@ entry:
; X64-LABEL: t13:
; X64: punpcklqdq %xmm0, %xmm1
; X64: pextrw $3, %xmm1, %eax
-; X64: pshufd $52, %xmm1, %xmm0
+; X64: pshufhw $12, %xmm1, %xmm0
; X64: pinsrw $4, %eax, %xmm0
; X64: ret
}
diff --git a/test/CodeGen/X86/sse41-blend.ll b/test/CodeGen/X86/sse41-blend.ll
index 4681fde..8ad7987 100644
--- a/test/CodeGen/X86/sse41-blend.ll
+++ b/test/CodeGen/X86/sse41-blend.ll
@@ -1,7 +1,7 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7 -mattr=+sse4.1 | FileCheck %s
;CHECK-LABEL: vsel_float:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x float> %v1, <4 x float> %v2
@@ -10,7 +10,7 @@ define <4 x float> @vsel_float(<4 x float> %v1, <4 x float> %v2) {
;CHECK-LABEL: vsel_4xi8:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i8> %v1, <4 x i8> %v2
@@ -18,7 +18,7 @@ define <4 x i8> @vsel_4xi8(<4 x i8> %v1, <4 x i8> %v2) {
}
;CHECK-LABEL: vsel_4xi16:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
%vsel = select <4 x i1> <i1 true, i1 false, i1 true, i1 true>, <4 x i16> %v1, <4 x i16> %v2
@@ -27,7 +27,7 @@ define <4 x i16> @vsel_4xi16(<4 x i16> %v1, <4 x i16> %v2) {
;CHECK-LABEL: vsel_i32:
-;CHECK: blendvps
+;CHECK: blendps
;CHECK: ret
define <4 x i32> @vsel_i32(<4 x i32> %v1, <4 x i32> %v2) {
%vsel = select <4 x i1> <i1 true, i1 true, i1 false, i1 true>, <4 x i32> %v1, <4 x i32> %v2
@@ -88,3 +88,35 @@ entry:
store double %extract214vector_func.i, double addrspace(1)* undef, align 8
ret void
}
+
+; If we can figure out a blend has a constant mask, we should emit the
+; blend instruction with an immediate mask
+define <2 x double> @constant_blendvpd(<2 x double> %xy, <2 x double> %ab) {
+; In this case, we emit a simple movss
+; CHECK-LABEL: constant_blendvpd
+; CHECK: movsd
+; CHECK: ret
+ %1 = select <2 x i1> <i1 true, i1 false>, <2 x double> %xy, <2 x double> %ab
+ ret <2 x double> %1
+}
+
+define <4 x float> @constant_blendvps(<4 x float> %xyzw, <4 x float> %abcd) {
+; CHECK-LABEL: constant_blendvps
+; CHECK-NOT: mov
+; CHECK: blendps $7
+; CHECK: ret
+ %1 = select <4 x i1> <i1 false, i1 false, i1 false, i1 true>, <4 x float> %xyzw, <4 x float> %abcd
+ ret <4 x float> %1
+}
+
+define <16 x i8> @constant_pblendvb(<16 x i8> %xyzw, <16 x i8> %abcd) {
+; CHECK-LABEL: constant_pblendvb:
+; CHECK: movaps
+; CHECK: pblendvb
+; CHECK: ret
+ %1 = select <16 x i1> <i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false, i1 false, i1 false, i1 true, i1 false, i1 true, i1 true, i1 true, i1 false>, <16 x i8> %xyzw, <16 x i8> %abcd
+ ret <16 x i8> %1
+}
+declare <16 x i8> @llvm.x86.sse41.pblendvb(<16 x i8>, <16 x i8>, <16 x i8>)
+declare <4 x float> @llvm.x86.sse41.blendvps(<4 x float>, <4 x float>, <4 x float>)
+declare <2 x double> @llvm.x86.sse41.blendvpd(<2 x double>, <2 x double>, <2 x double>)
diff --git a/test/CodeGen/X86/sse41.ll b/test/CodeGen/X86/sse41.ll
index c15e24c..a3c6201 100644
--- a/test/CodeGen/X86/sse41.ll
+++ b/test/CodeGen/X86/sse41.ll
@@ -1,5 +1,5 @@
-; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32
-; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64
+; RUN: llc < %s -mtriple=i686-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X32 --check-prefix=CHECK
+; RUN: llc < %s -mtriple=x86_64-apple-darwin9 -mattr=sse4.1 -mcpu=penryn | FileCheck %s -check-prefix=X64 --check-prefix=CHECK
@g16 = external global i16
@@ -249,3 +249,446 @@ entry:
; X64: ret
}
+define <4 x float> @insertps_from_shufflevector_1(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x float>* %pb, align 16
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_1:
+; CHECK-NOT: movss
+; CHECK-NOT: shufps
+; CHECK: insertps $48,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_shufflevector_2(<4 x float> %a, <4 x float> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x float> %a, <4 x float> %b, <4 x i32> <i32 0, i32 1, i32 5, i32 3>
+ ret <4 x float> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_2:
+; CHECK-NOT: shufps
+; CHECK: insertps $96,
+; CHECK: ret
+}
+
+; For loading an i32 from memory into an xmm register we use pinsrd
+; instead of insertps
+define <4 x i32> @pinsrd_from_shufflevector_i32(<4 x i32> %a, <4 x i32>* nocapture readonly %pb) {
+entry:
+ %0 = load <4 x i32>* %pb, align 16
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %0, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: pinsrd_from_shufflevector_i32:
+; CHECK-NOT: movss
+; CHECK-NOT: shufps
+; CHECK: pinsrd $3,
+; CHECK: ret
+}
+
+define <4 x i32> @insertps_from_shufflevector_i32_2(<4 x i32> %a, <4 x i32> %b) {
+entry:
+ %vecinit6 = shufflevector <4 x i32> %a, <4 x i32> %b, <4 x i32> <i32 0, i32 7, i32 2, i32 3>
+ ret <4 x i32> %vecinit6
+; CHECK-LABEL: insertps_from_shufflevector_i32_2:
+; CHECK-NOT: shufps
+; CHECK-NOT: movaps
+; CHECK: insertps $208,
+; CHECK: ret
+}
+
+define <4 x float> @insertps_from_load_ins_elt_undef(<4 x float> %a, float* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef:
+; CHECK-NOT: movss
+; CHECK-NOT: shufps
+; CHECK: insertps $16,
+; CHECK: ret
+ %1 = load float* %b, align 4
+ %2 = insertelement <4 x float> undef, float %1, i32 0
+ %result = shufflevector <4 x float> %a, <4 x float> %2, <4 x i32> <i32 0, i32 4, i32 2, i32 3>
+ ret <4 x float> %result
+}
+
+define <4 x i32> @insertps_from_load_ins_elt_undef_i32(<4 x i32> %a, i32* %b) {
+; CHECK-LABEL: insertps_from_load_ins_elt_undef_i32:
+; TODO: Like on pinsrd_from_shufflevector_i32, remove this mov instr
+;; aCHECK-NOT: movd
+; CHECK-NOT: shufps
+; CHECK: insertps $32,
+; CHECK: ret
+ %1 = load i32* %b, align 4
+ %2 = insertelement <4 x i32> undef, i32 %1, i32 0
+ %result = shufflevector <4 x i32> %a, <4 x i32> %2, <4 x i32> <i32 0, i32 1, i32 4, i32 3>
+ ret <4 x i32> %result
+}
+
+;;;;;; Shuffles optimizable with a single insertps instruction
+define <4 x float> @shuf_XYZ0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XY00(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_XYY0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext1, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ ret <4 x float> %vecinit5
+}
+
+define <4 x float> @shuf_XYW0(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_XYW0:
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext2 = extractelement <4 x float> %x, i32 3
+ %vecinit3 = insertelement <4 x float> %vecinit2, float %vecext2, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float 0.0, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_W00W(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 3
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit2 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = insertelement <4 x float> %vecinit2, float 0.0, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit3, float %vecext, i32 3
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00A(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X00X(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit1, float 0.0, i32 2
+ %vecinit4 = shufflevector <4 x float> %vecinit2, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x float> %vecinit4
+}
+
+define <4 x float> @shuf_X0YC(<4 x float> %x, <4 x float> %a) {
+; CHECK-LABEL: shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecinit1 = insertelement <4 x float> %vecinit, float 0.0, i32 1
+ %vecinit3 = shufflevector <4 x float> %vecinit1, <4 x float> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x float> %vecinit3, <4 x float> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x float> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYZ0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYZ0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $8
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext3 = extractelement <4 x i32> %x, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext3, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XY00(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XY00:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $12
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_XYY0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYY0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $104
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecinit4 = insertelement <4 x i32> %vecinit2, i32 %vecext1, i32 2
+ %vecinit5 = insertelement <4 x i32> %vecinit4, i32 0, i32 3
+ ret <4 x i32> %vecinit5
+}
+
+define <4 x i32> @i32_shuf_XYW0(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_XYW0:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $232
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecext1 = extractelement <4 x i32> %x, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 %vecext1, i32 1
+ %vecext2 = extractelement <4 x i32> %x, i32 3
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 %vecext2, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 0, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_W00W(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_W00W:
+; CHECK-NOT: pextrd
+; CHECK-NOT: punpckldq
+; CHECK: insertps $198
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 3
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit2 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = insertelement <4 x i32> %vecinit2, i32 0, i32 2
+ %vecinit4 = insertelement <4 x i32> %vecinit3, i32 %vecext, i32 3
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00A(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00A:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X00X(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X00X:
+; CHECK-NOT: movaps
+; CHECK-NOT: shufps
+; CHECK: insertps $48
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit2 = insertelement <4 x i32> %vecinit1, i32 0, i32 2
+ %vecinit4 = shufflevector <4 x i32> %vecinit2, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 2, i32 4>
+ ret <4 x i32> %vecinit4
+}
+
+define <4 x i32> @i32_shuf_X0YC(<4 x i32> %x, <4 x i32> %a) {
+; CHECK-LABEL: i32_shuf_X0YC:
+; CHECK: shufps
+; CHECK-NOT: movhlps
+; CHECK-NOT: shufps
+; CHECK: insertps $176
+; CHECK: ret
+ %vecext = extractelement <4 x i32> %x, i32 0
+ %vecinit = insertelement <4 x i32> undef, i32 %vecext, i32 0
+ %vecinit1 = insertelement <4 x i32> %vecinit, i32 0, i32 1
+ %vecinit3 = shufflevector <4 x i32> %vecinit1, <4 x i32> %x, <4 x i32> <i32 0, i32 1, i32 5, i32 undef>
+ %vecinit5 = shufflevector <4 x i32> %vecinit3, <4 x i32> %a, <4 x i32> <i32 0, i32 1, i32 2, i32 6>
+ ret <4 x i32> %vecinit5
+}
+
+;; Test for a bug in the first implementation of LowerBuildVectorv4x32
+define < 4 x float> @test_insertps_no_undef(<4 x float> %x) {
+; CHECK-LABEL: test_insertps_no_undef:
+; CHECK: movaps %xmm0, %xmm1
+; CHECK-NEXT: insertps $8, %xmm1, %xmm1
+; CHECK-NEXT: maxps %xmm1, %xmm0
+; CHECK-NEXT: ret
+ %vecext = extractelement <4 x float> %x, i32 0
+ %vecinit = insertelement <4 x float> undef, float %vecext, i32 0
+ %vecext1 = extractelement <4 x float> %x, i32 1
+ %vecinit2 = insertelement <4 x float> %vecinit, float %vecext1, i32 1
+ %vecext3 = extractelement <4 x float> %x, i32 2
+ %vecinit4 = insertelement <4 x float> %vecinit2, float %vecext3, i32 2
+ %vecinit5 = insertelement <4 x float> %vecinit4, float 0.0, i32 3
+ %mask = fcmp olt <4 x float> %vecinit5, %x
+ %res = select <4 x i1> %mask, <4 x float> %x, <4 x float>%vecinit5
+ ret <4 x float> %res
+}
+
+define <8 x i16> @blendvb_fallback(<8 x i1> %mask, <8 x i16> %x, <8 x i16> %y) {
+; CHECK-LABEL: blendvb_fallback
+; CHECK: blendvb
+; CHECK: ret
+ %ret = select <8 x i1> %mask, <8 x i16> %x, <8 x i16> %y
+ ret <8 x i16> %ret
+}
+
+define <4 x float> @insertps_from_vector_load(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 48)
+ ret <4 x float> %2
+}
+
+;; Use a non-zero CountS for insertps
+define <4 x float> @insertps_from_vector_load_offset(<4 x float> %a, <4 x float>* nocapture readonly %pb) {
+; CHECK-LABEL: insertps_from_vector_load_offset:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $96, 4(%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %pb, align 16
+ %2 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %1, i32 96)
+ ret <4 x float> %2
+}
+
+define <4 x float> @insertps_from_vector_load_offset_2(<4 x float> %a, <4 x float>* nocapture readonly %pb, i64 %index) {
+; CHECK-LABEL: insertps_from_vector_load_offset_2:
+; On X32, account for the argument's move to registers
+; X32: movl 4(%esp), %eax
+; X32: movl 8(%esp), %ecx
+; CHECK-NOT: mov
+;; Try to match a bit more of the instr, since we need the load's offset.
+; CHECK: insertps $192, 12(%{{...}},%{{...}}), %
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds <4 x float>* %pb, i64 %index
+ %2 = load <4 x float>* %1, align 16
+ %3 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %2, i32 192)
+ ret <4 x float> %3
+}
+
+define <4 x float> @insertps_from_broadcast_loadf32(<4 x float> %a, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_loadf32:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+define <4 x float> @insertps_from_broadcast_loadv4f32(<4 x float> %a, <4 x float>* nocapture readonly %b) {
+; CHECK-LABEL: insertps_from_broadcast_loadv4f32:
+; On X32, account for the arguments' move to registers
+; X32: movl 4(%esp), %{{...}}
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK-NEXT: ret
+ %1 = load <4 x float>* %b, align 4
+ %2 = extractelement <4 x float> %1, i32 0
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ ret <4 x float> %7
+}
+
+;; FIXME: We're emitting an extraneous pshufd/vbroadcast.
+define <4 x float> @insertps_from_broadcast_multiple_use(<4 x float> %a, <4 x float> %b, <4 x float> %c, <4 x float> %d, float* nocapture readonly %fb, i64 %index) {
+; CHECK-LABEL: insertps_from_broadcast_multiple_use:
+; On X32, account for the arguments' move to registers
+; X32: movl 8(%esp), %eax
+; X32: movl 4(%esp), %ecx
+; CHECK: movss
+; CHECK-NOT: mov
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: insertps $48
+; CHECK: addps
+; CHECK: addps
+; CHECK: addps
+; CHECK-NEXT: ret
+ %1 = getelementptr inbounds float* %fb, i64 %index
+ %2 = load float* %1, align 4
+ %3 = insertelement <4 x float> undef, float %2, i32 0
+ %4 = insertelement <4 x float> %3, float %2, i32 1
+ %5 = insertelement <4 x float> %4, float %2, i32 2
+ %6 = insertelement <4 x float> %5, float %2, i32 3
+ %7 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %a, <4 x float> %6, i32 48)
+ %8 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %b, <4 x float> %6, i32 48)
+ %9 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %c, <4 x float> %6, i32 48)
+ %10 = tail call <4 x float> @llvm.x86.sse41.insertps(<4 x float> %d, <4 x float> %6, i32 48)
+ %11 = fadd <4 x float> %7, %8
+ %12 = fadd <4 x float> %9, %10
+ %13 = fadd <4 x float> %11, %12
+ ret <4 x float> %13
+}
diff --git a/test/CodeGen/X86/stack-protector-dbginfo.ll b/test/CodeGen/X86/stack-protector-dbginfo.ll
index fb7e2db..cf88ade 100644
--- a/test/CodeGen/X86/stack-protector-dbginfo.ll
+++ b/test/CodeGen/X86/stack-protector-dbginfo.ll
@@ -33,7 +33,7 @@ attributes #0 = { sspreq }
!5 = metadata !{}
!6 = metadata !{metadata !7}
!7 = metadata !{i32 786472, metadata !"max_frame_size", i64 0} ; [ DW_TAG_enumerator ] [max_frame_size :: 0]
-!8 = metadata !{metadata !9}
+!8 = metadata !{metadata !9, metadata !24, metadata !41, metadata !65}
!9 = metadata !{i32 786478, metadata !1, metadata !10, metadata !"read_response_size", metadata !"read_response_size", metadata !"_Z18read_response_sizev", i32 27, metadata !11, i1 false, i1 true, i32 0, i32 0, null, i32 256, i1 true, i32 ()* @_Z18read_response_sizev, null, null, metadata !14, i32 27} ; [ DW_TAG_subprogram ] [line 27] [def] [read_response_size]
!10 = metadata !{i32 786473, metadata !1} ; [ DW_TAG_file_type ] [/Users/matt/ryan_bug/<unknown>]
!11 = metadata !{i32 786453, i32 0, null, metadata !"", i32 0, i64 0, i64 0, i64 0, i32 0, null, metadata !12, i32 0, null, null, null} ; [ DW_TAG_subroutine_type ] [line 0, size 0, align 0, offset 0] [from ]
diff --git a/test/CodeGen/X86/stack-protector.ll b/test/CodeGen/X86/stack-protector.ll
index 265ec80..4db0f9a 100644
--- a/test/CodeGen/X86/stack-protector.ll
+++ b/test/CodeGen/X86/stack-protector.ll
@@ -16,13 +16,14 @@
%struct.anon.0 = type { %union.anon.1 }
%union.anon.1 = type { [2 x i8] }
%struct.small = type { i8 }
+%struct.small_char = type { i32, [5 x i8] }
@.str = private unnamed_addr constant [4 x i8] c"%s\0A\00", align 1
; test1a: array of [16 x i8]
; no ssp attribute
; Requires no protector.
-define void @test1a(i8* %a) nounwind uwtable {
+define void @test1a(i8* %a) {
entry:
; LINUX-I386-LABEL: test1a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -53,7 +54,8 @@ entry:
; test1b: array of [16 x i8]
; ssp attribute
; Requires protector.
-define void @test1b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test1b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test1b:
; LINUX-I386: mov{{l|q}} %gs:
@@ -88,7 +90,8 @@ entry:
; test1c: array of [16 x i8]
; sspstrong attribute
; Requires protector.
-define void @test1c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test1c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test1c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -119,7 +122,8 @@ entry:
; test1d: array of [16 x i8]
; sspreq attribute
; Requires protector.
-define void @test1d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test1d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test1d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -150,7 +154,7 @@ entry:
; test2a: struct { [16 x i8] }
; no ssp attribute
; Requires no protector.
-define void @test2a(i8* %a) nounwind uwtable {
+define void @test2a(i8* %a) {
entry:
; LINUX-I386-LABEL: test2a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -183,7 +187,8 @@ entry:
; test2b: struct { [16 x i8] }
; ssp attribute
; Requires protector.
-define void @test2b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test2b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test2b:
; LINUX-I386: mov{{l|q}} %gs:
@@ -216,7 +221,8 @@ entry:
; test2c: struct { [16 x i8] }
; sspstrong attribute
; Requires protector.
-define void @test2c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test2c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test2c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -249,7 +255,8 @@ entry:
; test2d: struct { [16 x i8] }
; sspreq attribute
; Requires protector.
-define void @test2d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test2d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test2d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -282,7 +289,7 @@ entry:
; test3a: array of [4 x i8]
; no ssp attribute
; Requires no protector.
-define void @test3a(i8* %a) nounwind uwtable {
+define void @test3a(i8* %a) {
entry:
; LINUX-I386-LABEL: test3a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -313,7 +320,8 @@ entry:
; test3b: array [4 x i8]
; ssp attribute
; Requires no protector.
-define void @test3b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test3b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test3b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -344,7 +352,8 @@ entry:
; test3c: array of [4 x i8]
; sspstrong attribute
; Requires protector.
-define void @test3c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test3c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test3c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -375,7 +384,8 @@ entry:
; test3d: array of [4 x i8]
; sspreq attribute
; Requires protector.
-define void @test3d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test3d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test3d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -406,7 +416,7 @@ entry:
; test4a: struct { [4 x i8] }
; no ssp attribute
; Requires no protector.
-define void @test4a(i8* %a) nounwind uwtable {
+define void @test4a(i8* %a) {
entry:
; LINUX-I386-LABEL: test4a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -439,7 +449,8 @@ entry:
; test4b: struct { [4 x i8] }
; ssp attribute
; Requires no protector.
-define void @test4b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test4b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test4b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -472,7 +483,8 @@ entry:
; test4c: struct { [4 x i8] }
; sspstrong attribute
; Requires protector.
-define void @test4c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test4c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test4c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -505,7 +517,8 @@ entry:
; test4d: struct { [4 x i8] }
; sspreq attribute
; Requires protector.
-define void @test4d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test4d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test4d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -538,7 +551,7 @@ entry:
; test5a: no arrays / no nested arrays
; no ssp attribute
; Requires no protector.
-define void @test5a(i8* %a) nounwind uwtable {
+define void @test5a(i8* %a) {
entry:
; LINUX-I386-LABEL: test5a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -565,7 +578,8 @@ entry:
; test5b: no arrays / no nested arrays
; ssp attribute
; Requires no protector.
-define void @test5b(i8* %a) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test5b(i8* %a) #0 {
entry:
; LINUX-I386-LABEL: test5b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -592,7 +606,8 @@ entry:
; test5c: no arrays / no nested arrays
; sspstrong attribute
; Requires no protector.
-define void @test5c(i8* %a) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test5c(i8* %a) #1 {
entry:
; LINUX-I386-LABEL: test5c:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -619,7 +634,8 @@ entry:
; test5d: no arrays / no nested arrays
; sspreq attribute
; Requires protector.
-define void @test5d(i8* %a) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test5d(i8* %a) #2 {
entry:
; LINUX-I386-LABEL: test5d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -646,7 +662,7 @@ entry:
; test6a: Address-of local taken (j = &a)
; no ssp attribute
; Requires no protector.
-define void @test6a() nounwind uwtable {
+define void @test6a() {
entry:
; LINUX-I386-LABEL: test6a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -677,7 +693,8 @@ entry:
; test6b: Address-of local taken (j = &a)
; ssp attribute
; Requires no protector.
-define void @test6b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test6b() #0 {
entry:
; LINUX-I386-LABEL: test6b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -708,7 +725,8 @@ entry:
; test6c: Address-of local taken (j = &a)
; sspstrong attribute
; Requires protector.
-define void @test6c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test6c() #1 {
entry:
; LINUX-I386-LABEL: test6c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -739,7 +757,8 @@ entry:
; test6d: Address-of local taken (j = &a)
; sspreq attribute
; Requires protector.
-define void @test6d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test6d() #2 {
entry:
; LINUX-I386-LABEL: test6d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -770,7 +789,7 @@ entry:
; test7a: PtrToInt Cast
; no ssp attribute
; Requires no protector.
-define void @test7a() nounwind uwtable readnone {
+define void @test7a() {
entry:
; LINUX-I386-LABEL: test7a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -796,7 +815,8 @@ entry:
; test7b: PtrToInt Cast
; ssp attribute
; Requires no protector.
-define void @test7b() nounwind uwtable readnone ssp {
+; Function Attrs: ssp
+define void @test7b() #0 {
entry:
; LINUX-I386-LABEL: test7b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -822,7 +842,8 @@ entry:
; test7c: PtrToInt Cast
; sspstrong attribute
; Requires protector.
-define void @test7c() nounwind uwtable readnone sspstrong {
+; Function Attrs: sspstrong
+define void @test7c() #1 {
entry:
; LINUX-I386-LABEL: test7c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -848,7 +869,8 @@ entry:
; test7d: PtrToInt Cast
; sspreq attribute
; Requires protector.
-define void @test7d() nounwind uwtable readnone sspreq {
+; Function Attrs: sspreq
+define void @test7d() #2 {
entry:
; LINUX-I386-LABEL: test7d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -874,7 +896,7 @@ entry:
; test8a: Passing addr-of to function call
; no ssp attribute
; Requires no protector.
-define void @test8a() nounwind uwtable {
+define void @test8a() {
entry:
; LINUX-I386-LABEL: test8a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -892,14 +914,15 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test8b: Passing addr-of to function call
; ssp attribute
; Requires no protector.
-define void @test8b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test8b() #0 {
entry:
; LINUX-I386-LABEL: test8b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -917,14 +940,15 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test8c: Passing addr-of to function call
; sspstrong attribute
; Requires protector.
-define void @test8c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test8c() #1 {
entry:
; LINUX-I386-LABEL: test8c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -942,14 +966,15 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test8d: Passing addr-of to function call
; sspreq attribute
; Requires protector.
-define void @test8d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test8d() #2 {
entry:
; LINUX-I386-LABEL: test8d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -967,14 +992,14 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%b = alloca i32, align 4
- call void @funcall(i32* %b) nounwind
+ call void @funcall(i32* %b)
ret void
}
; test9a: Addr-of in select instruction
; no ssp attribute
; Requires no protector.
-define void @test9a() nounwind uwtable {
+define void @test9a() {
entry:
; LINUX-I386-LABEL: test9a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -992,7 +1017,7 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1003,7 +1028,8 @@ entry:
; test9b: Addr-of in select instruction
; ssp attribute
; Requires no protector.
-define void @test9b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test9b() #0 {
entry:
; LINUX-I386-LABEL: test9b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1021,7 +1047,7 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1032,7 +1058,8 @@ entry:
; test9c: Addr-of in select instruction
; sspstrong attribute
; Requires protector.
-define void @test9c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test9c() #1 {
entry:
; LINUX-I386-LABEL: test9c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1050,7 +1077,7 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1061,7 +1088,8 @@ entry:
; test9d: Addr-of in select instruction
; sspreq attribute
; Requires protector.
-define void @test9d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test9d() #2 {
entry:
; LINUX-I386-LABEL: test9d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1079,7 +1107,7 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp2 = fcmp ogt double %call, 0.000000e+00
%y.1 = select i1 %cmp2, double* %x, double* null
@@ -1090,7 +1118,7 @@ entry:
; test10a: Addr-of in phi instruction
; no ssp attribute
; Requires no protector.
-define void @test10a() nounwind uwtable {
+define void @test10a() {
entry:
; LINUX-I386-LABEL: test10a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1108,13 +1136,13 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1127,14 +1155,15 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test10b: Addr-of in phi instruction
; ssp attribute
; Requires no protector.
-define void @test10b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test10b() #0 {
entry:
; LINUX-I386-LABEL: test10b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1152,13 +1181,13 @@ entry:
; DARWIN-X64-NOT: callq ___stack_chk_fail
; DARWIN-X64: .cfi_endproc
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1171,14 +1200,15 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test10c: Addr-of in phi instruction
; sspstrong attribute
; Requires protector.
-define void @test10c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test10c() #1 {
entry:
; LINUX-I386-LABEL: test10c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1196,13 +1226,13 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1215,14 +1245,15 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test10d: Addr-of in phi instruction
; sspreq attribute
; Requires protector.
-define void @test10d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test10d() #2 {
entry:
; LINUX-I386-LABEL: test10d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1240,13 +1271,13 @@ entry:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%x = alloca double, align 8
- %call = call double @testi_aux() nounwind
+ %call = call double @testi_aux()
store double %call, double* %x, align 8
%cmp = fcmp ogt double %call, 3.140000e+00
br i1 %cmp, label %if.then, label %if.else
if.then: ; preds = %entry
- %call1 = call double @testi_aux() nounwind
+ %call1 = call double @testi_aux()
store double %call1, double* %x, align 8
br label %if.end4
@@ -1259,14 +1290,14 @@ if.then3: ; preds = %if.else
if.end4: ; preds = %if.else, %if.then3, %if.then
%y.0 = phi double* [ null, %if.then ], [ %x, %if.then3 ], [ null, %if.else ]
- %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0) nounwind
+ %call5 = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), double* %y.0)
ret void
}
; test11a: Addr-of struct element. (GEP followed by store).
; no ssp attribute
; Requires no protector.
-define void @test11a() nounwind uwtable {
+define void @test11a() {
entry:
; LINUX-I386-LABEL: test11a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1295,7 +1326,8 @@ entry:
; test11b: Addr-of struct element. (GEP followed by store).
; ssp attribute
; Requires no protector.
-define void @test11b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test11b() #0 {
entry:
; LINUX-I386-LABEL: test11b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1324,7 +1356,8 @@ entry:
; test11c: Addr-of struct element. (GEP followed by store).
; sspstrong attribute
; Requires protector.
-define void @test11c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test11c() #1 {
entry:
; LINUX-I386-LABEL: test11c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1353,7 +1386,8 @@ entry:
; test11d: Addr-of struct element. (GEP followed by store).
; sspreq attribute
; Requires protector.
-define void @test11d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test11d() #2 {
entry:
; LINUX-I386-LABEL: test11d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1382,7 +1416,7 @@ entry:
; test12a: Addr-of struct element, GEP followed by ptrtoint.
; no ssp attribute
; Requires no protector.
-define void @test12a() nounwind uwtable {
+define void @test12a() {
entry:
; LINUX-I386-LABEL: test12a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1410,7 +1444,8 @@ entry:
; test12b: Addr-of struct element, GEP followed by ptrtoint.
; ssp attribute
; Requires no protector.
-define void @test12b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test12b() #0 {
entry:
; LINUX-I386-LABEL: test12b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1437,8 +1472,8 @@ entry:
; test12c: Addr-of struct element, GEP followed by ptrtoint.
; sspstrong attribute
-; Requires protector.
-define void @test12c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test12c() #1 {
entry:
; LINUX-I386-LABEL: test12c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1466,7 +1501,8 @@ entry:
; test12d: Addr-of struct element, GEP followed by ptrtoint.
; sspreq attribute
; Requires protector.
-define void @test12d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test12d() #2 {
entry:
; LINUX-I386-LABEL: test12d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1494,7 +1530,7 @@ entry:
; test13a: Addr-of struct element, GEP followed by callinst.
; no ssp attribute
; Requires no protector.
-define void @test13a() nounwind uwtable {
+define void @test13a() {
entry:
; LINUX-I386-LABEL: test13a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1513,14 +1549,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test13b: Addr-of struct element, GEP followed by callinst.
; ssp attribute
; Requires no protector.
-define void @test13b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test13b() #0 {
entry:
; LINUX-I386-LABEL: test13b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1539,14 +1576,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test13c: Addr-of struct element, GEP followed by callinst.
; sspstrong attribute
; Requires protector.
-define void @test13c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test13c() #1 {
entry:
; LINUX-I386-LABEL: test13c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1565,14 +1603,15 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test13d: Addr-of struct element, GEP followed by callinst.
; sspreq attribute
; Requires protector.
-define void @test13d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test13d() #2 {
entry:
; LINUX-I386-LABEL: test13d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1591,14 +1630,14 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%c = alloca %struct.pair, align 4
%y = getelementptr inbounds %struct.pair* %c, i64 0, i32 1
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %y)
ret void
}
; test14a: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; no ssp attribute
; Requires no protector.
-define void @test14a() nounwind uwtable {
+define void @test14a() {
entry:
; LINUX-I386-LABEL: test14a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1617,14 +1656,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
; test14b: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; ssp attribute
; Requires no protector.
-define void @test14b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test14b() #0 {
entry:
; LINUX-I386-LABEL: test14b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1643,14 +1683,15 @@ entry:
; DARWIN-X64: .cfi_endproc
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
; test14c: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; sspstrong attribute
; Requires protector.
-define void @test14c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test14c() #1 {
entry:
; LINUX-I386-LABEL: test14c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1669,14 +1710,15 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
; test14d: Addr-of a local, optimized into a GEP (e.g., &a - 12)
; sspreq attribute
; Requires protector.
-define void @test14d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test14d() #2 {
entry:
; LINUX-I386-LABEL: test14d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1695,7 +1737,7 @@ entry:
; DARWIN-X64: callq ___stack_chk_fail
%a = alloca i32, align 4
%add.ptr5 = getelementptr inbounds i32* %a, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), i32* %add.ptr5)
ret void
}
@@ -1703,7 +1745,7 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; no ssp attribute
; Requires no protector.
-define void @test15a() nounwind uwtable {
+define void @test15a() {
entry:
; LINUX-I386-LABEL: test15a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1734,7 +1776,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; ssp attribute
; Requires no protector.
-define void @test15b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test15b() #0 {
entry:
; LINUX-I386-LABEL: test15b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1765,7 +1808,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspstrong attribute
; Requires protector.
-define void @test15c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test15c() #1 {
entry:
; LINUX-I386-LABEL: test15c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1796,7 +1840,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspreq attribute
; Requires protector.
-define void @test15d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test15d() #2 {
entry:
; LINUX-I386-LABEL: test15d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1827,7 +1872,7 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; no ssp attribute
; Requires no protector.
-define void @test16a() nounwind uwtable {
+define void @test16a() {
entry:
; LINUX-I386-LABEL: test16a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1847,7 +1892,7 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
@@ -1855,7 +1900,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; ssp attribute
; Requires no protector.
-define void @test16b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test16b() #0 {
entry:
; LINUX-I386-LABEL: test16b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1875,7 +1921,7 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
@@ -1883,7 +1929,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspstrong attribute
; Requires protector.
-define void @test16c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test16c() #1 {
entry:
; LINUX-I386-LABEL: test16c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1903,7 +1950,7 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
@@ -1911,7 +1958,8 @@ entry:
; (e.g., int a; ... ; float *b = &a;)
; sspreq attribute
; Requires protector.
-define void @test16d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test16d() #2 {
entry:
; LINUX-I386-LABEL: test16d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -1931,14 +1979,14 @@ entry:
%a = alloca i32, align 4
store i32 0, i32* %a, align 4
%0 = bitcast i32* %a to float*
- call void @funfloat(float* %0) nounwind
+ call void @funfloat(float* %0)
ret void
}
; test17a: Addr-of a vector nested in a struct
; no ssp attribute
; Requires no protector.
-define void @test17a() nounwind uwtable {
+define void @test17a() {
entry:
; LINUX-I386-LABEL: test17a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1958,14 +2006,15 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test17b: Addr-of a vector nested in a struct
; ssp attribute
; Requires no protector.
-define void @test17b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test17b() #0 {
entry:
; LINUX-I386-LABEL: test17b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -1985,14 +2034,15 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test17c: Addr-of a vector nested in a struct
; sspstrong attribute
; Requires protector.
-define void @test17c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test17c() #1 {
entry:
; LINUX-I386-LABEL: test17c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2012,14 +2062,15 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test17d: Addr-of a vector nested in a struct
; sspreq attribute
; Requires protector.
-define void @test17d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test17d() #2 {
entry:
; LINUX-I386-LABEL: test17d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2039,14 +2090,14 @@ entry:
%c = alloca %struct.vec, align 16
%y = getelementptr inbounds %struct.vec* %c, i64 0, i32 0
%add.ptr = getelementptr inbounds <4 x i32>* %y, i64 -12
- %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr) nounwind
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i64 0, i64 0), <4 x i32>* %add.ptr)
ret void
}
; test18a: Addr-of a variable passed into an invoke instruction.
; no ssp attribute
; Requires no protector.
-define i32 @test18a() uwtable {
+define i32 @test18a() {
entry:
; LINUX-I386-LABEL: test18a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2082,7 +2133,8 @@ lpad:
; test18b: Addr-of a variable passed into an invoke instruction.
; ssp attribute
; Requires no protector.
-define i32 @test18b() uwtable ssp {
+; Function Attrs: ssp
+define i32 @test18b() #0 {
entry:
; LINUX-I386-LABEL: test18b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2118,7 +2170,8 @@ lpad:
; test18c: Addr-of a variable passed into an invoke instruction.
; sspstrong attribute
; Requires protector.
-define i32 @test18c() uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test18c() #1 {
entry:
; LINUX-I386-LABEL: test18c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2154,7 +2207,8 @@ lpad:
; test18d: Addr-of a variable passed into an invoke instruction.
; sspreq attribute
; Requires protector.
-define i32 @test18d() uwtable sspreq {
+; Function Attrs: sspreq
+define i32 @test18d() #2 {
entry:
; LINUX-I386-LABEL: test18d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2186,12 +2240,11 @@ lpad:
catch i8* null
ret i32 0
}
-
; test19a: Addr-of a struct element passed into an invoke instruction.
; (GEP followed by an invoke)
; no ssp attribute
; Requires no protector.
-define i32 @test19a() uwtable {
+define i32 @test19a() {
entry:
; LINUX-I386-LABEL: test19a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2230,7 +2283,8 @@ lpad:
; (GEP followed by an invoke)
; ssp attribute
; Requires no protector.
-define i32 @test19b() uwtable ssp {
+; Function Attrs: ssp
+define i32 @test19b() #0 {
entry:
; LINUX-I386-LABEL: test19b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2269,7 +2323,8 @@ lpad:
; (GEP followed by an invoke)
; sspstrong attribute
; Requires protector.
-define i32 @test19c() uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test19c() #1 {
entry:
; LINUX-I386-LABEL: test19c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2308,7 +2363,8 @@ lpad:
; (GEP followed by an invoke)
; sspreq attribute
; Requires protector.
-define i32 @test19d() uwtable sspreq {
+; Function Attrs: sspreq
+define i32 @test19d() #2 {
entry:
; LINUX-I386-LABEL: test19d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2350,7 +2406,7 @@ lpad:
; test20a: Addr-of a pointer
; no ssp attribute
; Requires no protector.
-define void @test20a() nounwind uwtable {
+define void @test20a() {
entry:
; LINUX-I386-LABEL: test20a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2380,7 +2436,8 @@ entry:
; test20b: Addr-of a pointer
; ssp attribute
; Requires no protector.
-define void @test20b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test20b() #0 {
entry:
; LINUX-I386-LABEL: test20b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2410,7 +2467,8 @@ entry:
; test20c: Addr-of a pointer
; sspstrong attribute
; Requires protector.
-define void @test20c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test20c() #1 {
entry:
; LINUX-I386-LABEL: test20c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2440,7 +2498,8 @@ entry:
; test20d: Addr-of a pointer
; sspreq attribute
; Requires protector.
-define void @test20d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test20d() #2 {
entry:
; LINUX-I386-LABEL: test20d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2470,7 +2529,7 @@ entry:
; test21a: Addr-of a casted pointer
; no ssp attribute
; Requires no protector.
-define void @test21a() nounwind uwtable {
+define void @test21a() {
entry:
; LINUX-I386-LABEL: test21a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2501,7 +2560,8 @@ entry:
; test21b: Addr-of a casted pointer
; ssp attribute
; Requires no protector.
-define void @test21b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test21b() #0 {
entry:
; LINUX-I386-LABEL: test21b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2532,7 +2592,8 @@ entry:
; test21c: Addr-of a casted pointer
; sspstrong attribute
; Requires protector.
-define void @test21c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test21c() #1 {
entry:
; LINUX-I386-LABEL: test21c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2563,7 +2624,8 @@ entry:
; test21d: Addr-of a casted pointer
; sspreq attribute
; Requires protector.
-define void @test21d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test21d() #2 {
entry:
; LINUX-I386-LABEL: test21d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2594,7 +2656,7 @@ entry:
; test22a: [2 x i8] in a class
; no ssp attribute
; Requires no protector.
-define signext i8 @test22a() nounwind uwtable {
+define signext i8 @test22a() {
entry:
; LINUX-I386-LABEL: test22a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2621,7 +2683,8 @@ entry:
; test22b: [2 x i8] in a class
; ssp attribute
; Requires no protector.
-define signext i8 @test22b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define signext i8 @test22b() #0 {
entry:
; LINUX-I386-LABEL: test22b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2648,7 +2711,8 @@ entry:
; test22c: [2 x i8] in a class
; sspstrong attribute
; Requires protector.
-define signext i8 @test22c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define signext i8 @test22c() #1 {
entry:
; LINUX-I386-LABEL: test22c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2675,7 +2739,8 @@ entry:
; test22d: [2 x i8] in a class
; sspreq attribute
; Requires protector.
-define signext i8 @test22d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define signext i8 @test22d() #2 {
entry:
; LINUX-I386-LABEL: test22d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2702,7 +2767,7 @@ entry:
; test23a: [2 x i8] nested in several layers of structs and unions
; no ssp attribute
; Requires no protector.
-define signext i8 @test23a() nounwind uwtable {
+define signext i8 @test23a() {
entry:
; LINUX-I386-LABEL: test23a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2733,7 +2798,8 @@ entry:
; test23b: [2 x i8] nested in several layers of structs and unions
; ssp attribute
; Requires no protector.
-define signext i8 @test23b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define signext i8 @test23b() #0 {
entry:
; LINUX-I386-LABEL: test23b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2764,7 +2830,8 @@ entry:
; test23c: [2 x i8] nested in several layers of structs and unions
; sspstrong attribute
; Requires protector.
-define signext i8 @test23c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define signext i8 @test23c() #1 {
entry:
; LINUX-I386-LABEL: test23c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2795,7 +2862,8 @@ entry:
; test23d: [2 x i8] nested in several layers of structs and unions
; sspreq attribute
; Requires protector.
-define signext i8 @test23d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define signext i8 @test23d() #2 {
entry:
; LINUX-I386-LABEL: test23d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2826,7 +2894,7 @@ entry:
; test24a: Variable sized alloca
; no ssp attribute
; Requires no protector.
-define void @test24a(i32 %n) nounwind uwtable {
+define void @test24a(i32 %n) {
entry:
; LINUX-I386-LABEL: test24a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2857,7 +2925,8 @@ entry:
; test24b: Variable sized alloca
; ssp attribute
; Requires protector.
-define void @test24b(i32 %n) nounwind uwtable ssp {
+; Function Attrs: ssp
+define void @test24b(i32 %n) #0 {
entry:
; LINUX-I386-LABEL: test24b:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2888,7 +2957,8 @@ entry:
; test24c: Variable sized alloca
; sspstrong attribute
; Requires protector.
-define void @test24c(i32 %n) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test24c(i32 %n) #1 {
entry:
; LINUX-I386-LABEL: test24c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2919,7 +2989,8 @@ entry:
; test24d: Variable sized alloca
; sspreq attribute
; Requires protector.
-define void @test24d(i32 %n) nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define void @test24d(i32 %n) #2 {
entry:
; LINUX-I386-LABEL: test24d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -2950,7 +3021,7 @@ entry:
; test25a: array of [4 x i32]
; no ssp attribute
; Requires no protector.
-define i32 @test25a() nounwind uwtable {
+define i32 @test25a() {
entry:
; LINUX-I386-LABEL: test25a:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -2976,7 +3047,8 @@ entry:
; test25b: array of [4 x i32]
; ssp attribute
; Requires no protector, except for Darwin which _does_ require a protector.
-define i32 @test25b() nounwind uwtable ssp {
+; Function Attrs: ssp
+define i32 @test25b() #0 {
entry:
; LINUX-I386-LABEL: test25b:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -3002,7 +3074,8 @@ entry:
; test25c: array of [4 x i32]
; sspstrong attribute
; Requires protector.
-define i32 @test25c() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test25c() #1 {
entry:
; LINUX-I386-LABEL: test25c:
; LINUX-I386: mov{{l|q}} %gs:
@@ -3028,7 +3101,8 @@ entry:
; test25d: array of [4 x i32]
; sspreq attribute
; Requires protector.
-define i32 @test25d() nounwind uwtable sspreq {
+; Function Attrs: sspreq
+define i32 @test25d() #2 {
entry:
; LINUX-I386-LABEL: test25d:
; LINUX-I386: mov{{l|q}} %gs:
@@ -3056,7 +3130,8 @@ entry:
; a stack protector.
; ssptrong attribute
; Requires no protector.
-define void @test26() nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define void @test26() #1 {
entry:
; LINUX-I386-LABEL: test26:
; LINUX-I386-NOT: calll __stack_chk_fail
@@ -3087,7 +3162,8 @@ entry:
; Verify that the address-of analysis does not get stuck in infinite
; recursion when chasing the alloca through the PHI nodes.
; Requires protector.
-define i32 @test27(i32 %arg) nounwind uwtable sspstrong {
+; Function Attrs: sspstrong
+define i32 @test27(i32 %arg) #1 {
bb:
; LINUX-I386-LABEL: test27:
; LINUX-I386: mov{{l|q}} %gs:
@@ -3105,7 +3181,7 @@ bb:
; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
; DARWIN-X64: callq ___stack_chk_fail
%tmp = alloca %struct.small*, align 8
- %tmp1 = call i32 (...)* @dummy(%struct.small** %tmp) nounwind
+ %tmp1 = call i32 (...)* @dummy(%struct.small** %tmp)
%tmp2 = load %struct.small** %tmp, align 8
%tmp3 = ptrtoint %struct.small* %tmp2 to i64
%tmp4 = trunc i64 %tmp3 to i32
@@ -3133,10 +3209,239 @@ bb17: ; preds = %bb6
bb21: ; preds = %bb6, %bb
%tmp22 = phi i32 [ %tmp1, %bb ], [ %tmp14, %bb6 ]
- %tmp23 = call i32 (...)* @dummy(i32 %tmp22) nounwind
+ %tmp23 = call i32 (...)* @dummy(i32 %tmp22)
ret i32 undef
}
+; test28a: An array of [32 x i8] and a requested ssp-buffer-size of 33.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=33
+define i32 @test28a() #3 {
+entry:
+; LINUX-I386-LABEL: test28a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test28a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test28a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test28a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca [32 x i8], align 16
+ %arraydecay = getelementptr inbounds [32 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test28b: An array of [33 x i8] and a requested ssp-buffer-size of 33.
+; Requires protector.
+; Function Attrs: ssp stack-protector-buffer-size=33
+define i32 @test28b() #3 {
+entry:
+; LINUX-I386-LABEL: test28b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test28b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test28b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test28b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca [33 x i8], align 16
+ %arraydecay = getelementptr inbounds [33 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test29a: An array of [4 x i8] and a requested ssp-buffer-size of 5.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=5
+define i32 @test29a() #4 {
+entry:
+; LINUX-I386-LABEL: test29a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test29a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test29a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test29a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca [4 x i8], align 1
+ %arraydecay = getelementptr inbounds [4 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test29b: An array of [5 x i8] and a requested ssp-buffer-size of 5.
+; Requires protector.
+; Function Attrs: ssp stack-protector-buffer-size=5
+define i32 @test29b() #4 {
+entry:
+; LINUX-I386-LABEL: test29b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test29b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test29b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test29b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca [5 x i8], align 1
+ %arraydecay = getelementptr inbounds [5 x i8]* %test, i32 0, i32 0
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %arraydecay)
+ ret i32 %call
+}
+
+; test30a: A structure containing an i32 and an array of [5 x i8].
+; Requested ssp-buffer-size of 6.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=6
+define i32 @test30a() #5 {
+entry:
+; LINUX-I386-LABEL: test30a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test30a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test30a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test30a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca %struct.small_char, align 4
+ %test.coerce = alloca { i64, i8 }
+ %0 = bitcast { i64, i8 }* %test.coerce to i8*
+ %1 = bitcast %struct.small_char* %test to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 12, i32 0, i1 false)
+ %2 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 0
+ %3 = load i64* %2, align 1
+ %4 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 1
+ %5 = load i8* %4, align 1
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+ ret i32 %call
+}
+
+; test30b: A structure containing an i32 and an array of [5 x i8].
+; Requested ssp-buffer-size of 5.
+; Requires protector.
+; Function Attrs: ssp stack-protector-buffer-size=5
+define i32 @test30b() #4 {
+entry:
+; LINUX-I386-LABEL: test30b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test30b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test30b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test30b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca %struct.small_char, align 4
+ %test.coerce = alloca { i64, i8 }
+ %0 = bitcast { i64, i8 }* %test.coerce to i8*
+ %1 = bitcast %struct.small_char* %test to i8*
+ call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 12, i32 0, i1 false)
+ %2 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 0
+ %3 = load i64* %2, align 1
+ %4 = getelementptr { i64, i8 }* %test.coerce, i32 0, i32 1
+ %5 = load i8* %4, align 1
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i64 %3, i8 %5)
+ ret i32 %call
+}
+
+; test31a: An alloca of size 4.
+; Requested ssp-buffer-size of 6.
+; Requires no protector.
+; Function Attrs: ssp stack-protector-buffer-size=6
+define i32 @test31a() #5 {
+entry:
+; LINUX-I386-LABEL: test31a:
+; LINUX-I386-NOT: calll __stack_chk_fail
+; LINUX-I386: .cfi_endproc
+
+; LINUX-X64-LABEL: test31a:
+; LINUX-X64-NOT: callq __stack_chk_fail
+; LINUX-X64: .cfi_endproc
+
+; LINUX-KERNEL-X64-LABEL: test31a:
+; LINUX-KERNEL-X64-NOT: callq __stack_chk_fail
+; LINUX-KERNEL-X64: .cfi_endproc
+
+; DARWIN-X64-LABEL: test31a:
+; DARWIN-X64-NOT: callq ___stack_chk_fail
+; DARWIN-X64: .cfi_endproc
+ %test = alloca i8*, align 8
+ %0 = alloca i8, i64 4
+ store i8* %0, i8** %test, align 8
+ %1 = load i8** %test, align 8
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %1)
+ ret i32 %call
+}
+
+; test31b: An alloca of size 5.
+; Requested ssp-buffer-size of 5.
+; Requires protector.
+define i32 @test31b() #4 {
+entry:
+; LINUX-I386-LABEL: test31b:
+; LINUX-I386: mov{{l|q}} %gs:
+; LINUX-I386: calll __stack_chk_fail
+
+; LINUX-X64-LABEL: test31b:
+; LINUX-X64: mov{{l|q}} %fs:
+; LINUX-X64: callq __stack_chk_fail
+
+; LINUX-KERNEL-X64-LABEL: test31b:
+; LINUX-KERNEL-X64: mov{{l|q}} %gs:
+; LINUX-KERNEL-X64: callq __stack_chk_fail
+
+; DARWIN-X64-LABEL: test31b:
+; DARWIN-X64: mov{{l|q}} ___stack_chk_guard
+; DARWIN-X64: callq ___stack_chk_fail
+ %test = alloca i8*, align 8
+ %0 = alloca i8, i64 5
+ store i8* %0, i8** %test, align 8
+ %1 = load i8** %test, align 8
+ %call = call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([4 x i8]* @.str, i32 0, i32 0), i8* %1)
+ ret i32 %call
+}
+
declare double @testi_aux()
declare i8* @strcpy(i8*, i8*)
declare i32 @printf(i8*, ...)
@@ -3148,3 +3453,11 @@ declare void @_Z3exceptPi(i32*)
declare i32 @__gxx_personality_v0(...)
declare i32* @getp()
declare i32 @dummy(...)
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture readonly, i64, i32, i1)
+
+attributes #0 = { ssp }
+attributes #1 = { sspstrong }
+attributes #2 = { sspreq }
+attributes #3 = { ssp "stack-protector-buffer-size"="33" }
+attributes #4 = { ssp "stack-protector-buffer-size"="5" }
+attributes #5 = { ssp "stack-protector-buffer-size"="6" }
diff --git a/test/CodeGen/X86/stackpointer.ll b/test/CodeGen/X86/stackpointer.ll
new file mode 100644
index 0000000..80bcfbf
--- /dev/null
+++ b/test/CodeGen/X86/stackpointer.ll
@@ -0,0 +1,28 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-linux-gnueabi | FileCheck %s
+; RUN: opt < %s -O3 -S -mtriple=x86_64-linux-gnueabi | FileCheck %s --check-prefix=OPT
+
+define i64 @get_stack() nounwind {
+entry:
+; CHECK-LABEL: get_stack:
+; CHECK: movq %rsp, %rax
+ %sp = call i64 @llvm.read_register.i64(metadata !0)
+; OPT: @llvm.read_register.i64
+ ret i64 %sp
+}
+
+define void @set_stack(i64 %val) nounwind {
+entry:
+; CHECK-LABEL: set_stack:
+; CHECK: movq %rdi, %rsp
+ call void @llvm.write_register.i64(metadata !0, i64 %val)
+; OPT: @llvm.write_register.i64
+ ret void
+}
+
+declare i64 @llvm.read_register.i64(metadata) nounwind
+declare void @llvm.write_register.i64(metadata, i64) nounwind
+
+; register unsigned long current_stack_pointer asm("rsp");
+; CHECK-NOT: .asciz "rsp"
+!0 = metadata !{metadata !"rsp\00"}
diff --git a/test/CodeGen/X86/tls.ll b/test/CodeGen/X86/tls.ll
index 76a8402..75e7fc4 100644
--- a/test/CodeGen/X86/tls.ll
+++ b/test/CodeGen/X86/tls.ll
@@ -2,6 +2,8 @@
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-linux-gnu | FileCheck -check-prefix=X64_LINUX %s
; RUN: llc < %s -march=x86 -mtriple=x86-pc-win32 | FileCheck -check-prefix=X32_WIN %s
; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-win32 | FileCheck -check-prefix=X64_WIN %s
+; RUN: llc < %s -march=x86 -mtriple=x86-pc-windows-gnu | FileCheck -check-prefix=MINGW32 %s
+; RUN: llc < %s -march=x86-64 -mtriple=x86_64-pc-windows-gnu | FileCheck -check-prefix=X64_WIN %s
@i1 = thread_local global i32 15
@i2 = external thread_local global i32
@@ -30,6 +32,12 @@ define i32 @f1() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movl i1@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f1:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i1
@@ -57,6 +65,12 @@ define i32* @f2() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: leaq i1@SECREL32(%rax), %rax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f2:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i1
@@ -83,6 +97,12 @@ define i32 @f3() nounwind {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movl i2@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f3:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i2@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i2
@@ -110,6 +130,12 @@ define i32* @f4() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: leaq i2@SECREL32(%rax), %rax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f4:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i2@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i2
@@ -134,6 +160,12 @@ define i32 @f5() nounwind {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movl i3@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f5:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i3@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i3
@@ -161,6 +193,12 @@ define i32* @f6() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: leaq i3@SECREL32(%rax), %rax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f6:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i3@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i3
@@ -173,6 +211,12 @@ define i32 @f7() {
; X64_LINUX-LABEL: f7:
; X64_LINUX: movl %fs:i4@TPOFF, %eax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f7:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i4@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i4
@@ -188,6 +232,12 @@ define i32* @f8() {
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i4@TPOFF(%rax), %rax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f8:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i4@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i4
@@ -200,6 +250,12 @@ define i32 @f9() {
; X64_LINUX-LABEL: f9:
; X64_LINUX: movl %fs:i5@TPOFF, %eax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f9:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movl _i5@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i32* @i5
@@ -215,6 +271,12 @@ define i32* @f10() {
; X64_LINUX: movq %fs:0, %rax
; X64_LINUX-NEXT: leaq i5@TPOFF(%rax), %rax
; X64_LINUX-NEXT: ret
+; MINGW32-LABEL: _f10:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: leal _i5@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
ret i32* @i5
@@ -239,6 +301,12 @@ define i16 @f11() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movzwl s1@SECREL32(%rax), %eax
; X64_WIN: ret
+; MINGW32-LABEL: _f11:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movzwl _s1@SECREL32(%eax), %eax
+; MINGW32: retl
entry:
%tmp1 = load i16* @s1
@@ -264,6 +332,13 @@ define i32 @f12() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movswl s1@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f12:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movswl _s1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
+
entry:
%tmp1 = load i16* @s1
@@ -290,6 +365,12 @@ define i8 @f13() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movb b1@SECREL32(%rax), %al
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f13:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movb _b1@SECREL32(%eax), %al
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i8* @b1
@@ -315,6 +396,12 @@ define i32 @f14() {
; X64_WIN-NEXT: movq (%rcx,%rax,8), %rax
; X64_WIN-NEXT: movsbl b1@SECREL32(%rax), %eax
; X64_WIN-NEXT: ret
+; MINGW32-LABEL: _f14:
+; MINGW32: movl __tls_index, %eax
+; MINGW32-NEXT: movl %fs:44, %ecx
+; MINGW32-NEXT: movl (%ecx,%eax,4), %eax
+; MINGW32-NEXT: movsbl _b1@SECREL32(%eax), %eax
+; MINGW32-NEXT: retl
entry:
%tmp1 = load i8* @b1
diff --git a/test/CodeGen/X86/vec_shuffle-41.ll b/test/CodeGen/X86/vec_shuffle-41.ll
new file mode 100644
index 0000000..28fdd2f
--- /dev/null
+++ b/test/CodeGen/X86/vec_shuffle-41.ll
@@ -0,0 +1,21 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=corei7-avx | FileCheck %s
+
+; Use buildFromShuffleMostly which allows this to be generated as two 128-bit
+; shuffles and an insert.
+
+; This is the (somewhat questionable) LLVM IR that is generated for:
+; x8.s0123456 = x8.s1234567; // x8 is a <8 x float> type
+; x8.s7 = f; // f is float
+
+
+define <8 x float> @test1(<8 x float> %a, float %b) {
+; CHECK-LABEL: test1:
+; CHECK: vinsertps
+; CHECK-NOT: vinsertps
+entry:
+ %shift = shufflevector <8 x float> %a, <8 x float> undef, <7 x i32> <i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7>
+ %extend = shufflevector <7 x float> %shift, <7 x float> undef, <8 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 undef>
+ %insert = insertelement <8 x float> %extend, float %b, i32 7
+
+ ret <8 x float> %insert
+}
diff --git a/test/CodeGen/X86/vec_splat.ll b/test/CodeGen/X86/vec_splat.ll
index 543c96e..a02e383 100644
--- a/test/CodeGen/X86/vec_splat.ll
+++ b/test/CodeGen/X86/vec_splat.ll
@@ -32,3 +32,19 @@ define void @test_v2sd(<2 x double>* %P, <2 x double>* %Q, double %X) nounwind {
; SSE3-LABEL: test_v2sd:
; SSE3: movddup
}
+
+; Fold extract of a load into the load's address computation. This avoids spilling to the stack.
+define <4 x float> @load_extract_splat(<4 x float>* nocapture readonly %ptr, i64 %i, i64 %j) nounwind {
+ %1 = getelementptr inbounds <4 x float>* %ptr, i64 %i
+ %2 = load <4 x float>* %1, align 16
+ %3 = extractelement <4 x float> %2, i64 %j
+ %4 = insertelement <4 x float> undef, float %3, i32 0
+ %5 = insertelement <4 x float> %4, float %3, i32 1
+ %6 = insertelement <4 x float> %5, float %3, i32 2
+ %7 = insertelement <4 x float> %6, float %3, i32 3
+ ret <4 x float> %7
+
+; AVX-LABEL: load_extract_splat
+; AVX-NOT: movs
+; AVX: vbroadcastss
+}
diff --git a/test/CodeGen/X86/vector-idiv.ll b/test/CodeGen/X86/vector-idiv.ll
new file mode 100644
index 0000000..4c30184
--- /dev/null
+++ b/test/CodeGen/X86/vector-idiv.ll
@@ -0,0 +1,217 @@
+; RUN: llc -march=x86-64 -mcpu=core2 -mattr=+sse4.1 < %s | FileCheck %s -check-prefix=SSE41
+; RUN: llc -march=x86-64 -mcpu=core2 < %s | FileCheck %s -check-prefix=SSE
+; RUN: llc -march=x86-64 -mcpu=core-avx2 < %s | FileCheck %s -check-prefix=AVX
+
+define <4 x i32> @test1(<4 x i32> %a) {
+ %div = udiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %div
+
+; SSE41-LABEL: test1:
+; SSE41: pmuludq
+; SSE41: pshufd $57
+; SSE41: pmuludq
+; SSE41: shufps $-35
+; SSE41: psubd
+; SSE41: psrld $1
+; SSE41: padd
+; SSE41: psrld $2
+
+; AVX-LABEL: test1:
+; AVX: vpmuludq
+; AVX: vpshufd $57
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpsubd
+; AVX: vpsrld $1
+; AVX: vpadd
+; AVX: vpsrld $2
+}
+
+define <8 x i32> @test2(<8 x i32> %a) {
+ %div = udiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %div
+
+; AVX-LABEL: test2:
+; AVX: vpermd
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpsubd
+; AVX: vpsrld $1
+; AVX: vpadd
+; AVX: vpsrld $2
+}
+
+define <8 x i16> @test3(<8 x i16> %a) {
+ %div = udiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %div
+
+; SSE41-LABEL: test3:
+; SSE41: pmulhuw
+; SSE41: psubw
+; SSE41: psrlw $1
+; SSE41: paddw
+; SSE41: psrlw $2
+
+; AVX-LABEL: test3:
+; AVX: vpmulhuw
+; AVX: vpsubw
+; AVX: vpsrlw $1
+; AVX: vpaddw
+; AVX: vpsrlw $2
+}
+
+define <16 x i16> @test4(<16 x i16> %a) {
+ %div = udiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %div
+
+; AVX-LABEL: test4:
+; AVX: vpmulhuw
+; AVX: vpsubw
+; AVX: vpsrlw $1
+; AVX: vpaddw
+; AVX: vpsrlw $2
+; AVX-NOT: vpmulhuw
+}
+
+define <8 x i16> @test5(<8 x i16> %a) {
+ %div = sdiv <8 x i16> %a, <i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7>
+ ret <8 x i16> %div
+
+; SSE41-LABEL: test5:
+; SSE41: pmulhw
+; SSE41: psrlw $15
+; SSE41: psraw $1
+; SSE41: paddw
+
+; AVX-LABEL: test5:
+; AVX: vpmulhw
+; AVX: vpsrlw $15
+; AVX: vpsraw $1
+; AVX: vpaddw
+}
+
+define <16 x i16> @test6(<16 x i16> %a) {
+ %div = sdiv <16 x i16> %a, <i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7, i16 7,i16 7, i16 7, i16 7, i16 7>
+ ret <16 x i16> %div
+
+; AVX-LABEL: test6:
+; AVX: vpmulhw
+; AVX: vpsrlw $15
+; AVX: vpsraw $1
+; AVX: vpaddw
+; AVX-NOT: vpmulhw
+}
+
+define <16 x i8> @test7(<16 x i8> %a) {
+ %div = sdiv <16 x i8> %a, <i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7, i8 7,i8 7, i8 7, i8 7, i8 7>
+ ret <16 x i8> %div
+}
+
+define <4 x i32> @test8(<4 x i32> %a) {
+ %div = sdiv <4 x i32> %a, <i32 7, i32 7, i32 7, i32 7>
+ ret <4 x i32> %div
+
+; SSE41-LABEL: test8:
+; SSE41: pmuldq
+; SSE41: pshufd $57
+; SSE41-NOT: pshufd $57
+; SSE41: pmuldq
+; SSE41: shufps $-35
+; SSE41: pshufd $-40
+; SSE41: padd
+; SSE41: psrld $31
+; SSE41: psrad $2
+; SSE41: padd
+
+; SSE-LABEL: test8:
+; SSE: psrad $31
+; SSE: pand
+; SSE: paddd
+; SSE: pmuludq
+; SSE: pshufd $57
+; SSE-NOT: pshufd $57
+; SSE: pmuludq
+; SSE: shufps $-35
+; SSE: pshufd $-40
+; SSE: psubd
+; SSE: padd
+; SSE: psrld $31
+; SSE: psrad $2
+; SSE: padd
+
+; AVX-LABEL: test8:
+; AVX: vpmuldq
+; AVX: vpshufd $57
+; AVX-NOT: vpshufd $57
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpshufd $-40
+; AVX: vpadd
+; AVX: vpsrld $31
+; AVX: vpsrad $2
+; AVX: vpadd
+}
+
+define <8 x i32> @test9(<8 x i32> %a) {
+ %div = sdiv <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %div
+
+; AVX-LABEL: test9:
+; AVX: vpbroadcastd
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpshufd $-40
+; AVX: vpadd
+; AVX: vpsrld $31
+; AVX: vpsrad $2
+; AVX: vpadd
+}
+
+define <8 x i32> @test10(<8 x i32> %a) {
+ %rem = urem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %rem
+
+; AVX-LABEL: test10:
+; AVX: vpbroadcastd
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpmuludq
+; AVX: vshufps $-35
+; AVX: vpsubd
+; AVX: vpsrld $1
+; AVX: vpadd
+; AVX: vpsrld $2
+; AVX: vpmulld
+}
+
+define <8 x i32> @test11(<8 x i32> %a) {
+ %rem = srem <8 x i32> %a, <i32 7, i32 7, i32 7, i32 7,i32 7, i32 7, i32 7, i32 7>
+ ret <8 x i32> %rem
+
+; AVX-LABEL: test11:
+; AVX: vpbroadcastd
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpmuldq
+; AVX: vshufps $-35
+; AVX: vpshufd $-40
+; AVX: vpadd
+; AVX: vpsrld $31
+; AVX: vpsrad $2
+; AVX: vpadd
+; AVX: vpmulld
+}
+
+define <2 x i16> @test12() {
+ %I8 = insertelement <2 x i16> zeroinitializer, i16 -1, i32 0
+ %I9 = insertelement <2 x i16> %I8, i16 -1, i32 1
+ %B9 = urem <2 x i16> %I9, %I9
+ ret <2 x i16> %B9
+
+; AVX-LABEL: test12:
+; AVX: xorps
+}
diff --git a/test/CodeGen/X86/win32_sret.ll b/test/CodeGen/X86/win32_sret.ll
index d8ecd44..8728712 100644
--- a/test/CodeGen/X86/win32_sret.ll
+++ b/test/CodeGen/X86/win32_sret.ll
@@ -2,9 +2,11 @@
; some setups (e.g., Atom) from affecting the output.
; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -mtriple=i686-pc-cygwin | FileCheck %s -check-prefix=CYGWIN
; RUN: llc < %s -mcpu=core2 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-win32 | FileCheck %s -check-prefix=WIN32
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-mingw32 | FileCheck %s -check-prefix=MINGW_X86
+; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i686-pc-cygwin | FileCheck %s -check-prefix=CYGWIN
; RUN: llc < %s -mcpu=core2 -O0 -mtriple=i386-pc-linux | FileCheck %s -check-prefix=LINUX
; The SysV ABI used by most Unixes and Mingw on x86 specifies that an sret pointer
@@ -21,6 +23,9 @@ entry:
; MINGW_X86-LABEL: _sret1:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret1:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret1:
; LINUX: retl $4
@@ -38,6 +43,9 @@ entry:
; MINGW_X86-LABEL: _sret2:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret2:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret2:
; LINUX: retl $4
@@ -56,6 +64,9 @@ entry:
; MINGW_X86-LABEL: _sret3:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret3:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret3:
; LINUX: retl $4
@@ -77,6 +88,9 @@ entry:
; MINGW_X86-LABEL: _sret4:
; MINGW_X86: {{retl$}}
+; CYGWIN-LABEL: _sret4:
+; CYGWIN: retl $4
+
; LINUX-LABEL: sret4:
; LINUX: retl $4
@@ -98,6 +112,7 @@ entry:
ret void
; WIN32-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; MINGW_X86-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
+; CYGWIN-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; LINUX-LABEL: {{^}}"?foo@C5@@QAE?AUS5@@XZ":
; The address of the return structure is passed as an implicit parameter.
@@ -115,6 +130,7 @@ entry:
call x86_thiscallcc void @"\01?foo@C5@@QAE?AUS5@@XZ"(%struct.S5* sret %s, %class.C5* %c)
; WIN32-LABEL: {{^}}_call_foo5:
; MINGW_X86-LABEL: {{^}}_call_foo5:
+; CYGWIN-LABEL: {{^}}_call_foo5:
; LINUX-LABEL: {{^}}call_foo5:
@@ -135,6 +151,7 @@ entry:
define void @test6_f(%struct.test6* %x) nounwind {
; WIN32-LABEL: _test6_f:
; MINGW_X86-LABEL: _test6_f:
+; CYGWIN-LABEL: _test6_f:
; LINUX-LABEL: test6_f:
; The %x argument is moved to %ecx. It will be the this pointer.
@@ -145,6 +162,9 @@ define void @test6_f(%struct.test6* %x) nounwind {
; MINGW_X86: movl 8(%ebp), %eax
; MINGW_X86: movl %eax, (%e{{([a-d]x)|(sp)}})
+; CYGWIN: movl 8(%ebp), %eax
+; CYGWIN: movl %eax, (%e{{([a-d]x)|(sp)}})
+
; The sret pointer is (%esp)
; WIN32: leal 8(%esp), %[[REG:e[a-d]x]]
; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
@@ -153,8 +173,71 @@ define void @test6_f(%struct.test6* %x) nounwind {
; MINGW_X86-NEXT: leal 8(%esp), %ecx
; MINGW_X86-NEXT: calll _test6_g
+; CYGWIN-NEXT: leal 8(%esp), %ecx
+; CYGWIN-NEXT: calll _test6_g
+
%tmp = alloca %struct.test6, align 4
call x86_thiscallcc void @test6_g(%struct.test6* sret %tmp, %struct.test6* %x)
ret void
}
declare x86_thiscallcc void @test6_g(%struct.test6* sret, %struct.test6*)
+
+; Flipping the parameters at the IR level generates the same code.
+%struct.test7 = type { i32, i32, i32 }
+define void @test7_f(%struct.test7* %x) nounwind {
+; WIN32-LABEL: _test7_f:
+; MINGW_X86-LABEL: _test7_f:
+; CYGWIN-LABEL: _test7_f:
+; LINUX-LABEL: test7_f:
+
+; The %x argument is moved to %ecx on all OSs. It will be the this pointer.
+; WIN32: movl 8(%ebp), %ecx
+; MINGW_X86: movl 8(%ebp), %ecx
+; CYGWIN: movl 8(%ebp), %ecx
+
+; The sret pointer is (%esp)
+; WIN32: leal 8(%esp), %[[REG:e[a-d]x]]
+; WIN32-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; MINGW_X86: leal 8(%esp), %[[REG:e[a-d]x]]
+; MINGW_X86-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+; CYGWIN: leal 8(%esp), %[[REG:e[a-d]x]]
+; CYGWIN-NEXT: movl %[[REG]], (%e{{([a-d]x)|(sp)}})
+
+ %tmp = alloca %struct.test7, align 4
+ call x86_thiscallcc void @test7_g(%struct.test7* %x, %struct.test7* sret %tmp)
+ ret void
+}
+
+define x86_thiscallcc void @test7_g(%struct.test7* %in, %struct.test7* sret %out) {
+ %s = getelementptr %struct.test7* %in, i32 0, i32 0
+ %d = getelementptr %struct.test7* %out, i32 0, i32 0
+ %v = load i32* %s
+ store i32 %v, i32* %d
+ call void @clobber_eax()
+ ret void
+
+; Make sure we return the second parameter in %eax.
+; WIN32-LABEL: _test7_g:
+; WIN32: calll _clobber_eax
+; WIN32: movl {{.*}}, %eax
+; WIN32: retl
+}
+
+declare void @clobber_eax()
+
+; Test what happens if the first parameter has to be split by codegen.
+; Realistically, no frontend will generate code like this, but here it is for
+; completeness.
+define void @test8_f(i64 inreg %a, i64* sret %out) {
+ store i64 %a, i64* %out
+ call void @clobber_eax()
+ ret void
+
+; WIN32-LABEL: _test8_f:
+; WIN32: movl {{[0-9]+}}(%esp), %[[out:[a-z]+]]
+; WIN32-DAG: movl %edx, 4(%[[out]])
+; WIN32-DAG: movl %eax, (%[[out]])
+; WIN32: calll _clobber_eax
+; WIN32: movl {{.*}}, %eax
+; WIN32: retl
+}
diff --git a/test/CodeGen/X86/x86-64-sret-return-2.ll b/test/CodeGen/X86/x86-64-sret-return-2.ll
new file mode 100644
index 0000000..9f57ee1
--- /dev/null
+++ b/test/CodeGen/X86/x86-64-sret-return-2.ll
@@ -0,0 +1,18 @@
+; RUN: llc -mtriple=x86_64-apple-darwin8 < %s | FileCheck %s
+; RUN: llc -mtriple=x86_64-pc-linux < %s | FileCheck %s
+
+; FIXME: x32 doesn't know how to select this. This isn't a regression, it never
+; worked.
+; RUNX: llc -mtriple=x86_64-pc-linux-gnux32 < %s | FileCheck -check-prefix=X32ABI %s
+
+; This used to crash due to topological sorting issues in selection DAG.
+define void @foo(i32* sret %agg.result, i32, i32, i32, i32, i32, void (i32)* %pred) {
+entry:
+ call void %pred(i32 undef)
+ ret void
+
+; CHECK-LABEL: foo:
+; CHECK: callq
+; CHECK: movq {{.*}}, %rax
+; CHECK: ret
+}