26 files changed, 982 insertions, 100 deletions
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b67..0465952 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
 ; It's hard to test for the ISEL condition because CodeGen optimizes
 ; away the bugpointed code. Just ensure the basics are still there.
 ;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
 ;CHECK: vinsertf128
 ;CHECK: vpshufd
 ;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 0000000..6ebbb2e
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+  %cmp = fcmp olt float %mean, -3.000000e+00
+  %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+  %cmp2 = fcmp ult float %f.0, 3.000000e+00
+  %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+  %add = fadd float %f.1, 3.000000e+00
+  %div = fdiv float %add, 2.343750e-02
+  %0 = fpext float %div to double
+  %conv = select i1 undef, double 2.550000e+02, double %0
+  %add8 = fadd double %conv, 5.000000e-01
+  %conv9 = fptosi double %add8 to i32
+  %.conv9 = select i1 undef, i32 255, i32 %conv9
+  ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 0000000..0f36ce2
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,362 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true  < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq  $136, %rsp
+;NOCOLOR: subq  $264, %rsp
+; %a (17 pointers, 136 bytes) and %a2 (16 pointers, 128 bytes) have lifetime
+; ranges that do not overlap: 136 bytes when sharing a slot, 264 when separate.
+define i32 @myCall_w2(i32 %in) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+}
+
+
+;YESCOLOR: subq  $272, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  ret i32 %t7
+bb3:
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  ret i32 0
+}
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+;YESCOLOR: subq  $208, %rsp
+;NOCOLOR: subq  $400, %rsp
+; Four arrays of 112, 104, 96 and 88 bytes (400 in total). %b4 is started first
+; and ended last, so it overlaps the others; %b1, %b2 and %b3 are disjoint.
+; 112 + 88 = 200; the remaining 8 bytes are presumably padding (TODO confirm).
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+  %a1 = alloca [14 x i8*], align 8
+  %a2 = alloca [13 x i8*], align 8
+  %a3 = alloca [12 x i8*], align 8
+  %a4 = alloca [11 x i8*], align 8
+  %b1 = bitcast [14 x i8*]* %a1 to i8*
+  %b2 = bitcast [13 x i8*]* %a2 to i8*
+  %b3 = bitcast [12 x i8*]* %a3 to i8*
+  %b4 = bitcast [11 x i8*]* %a4 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b4)
+  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  %t1 = call i32 @foo(i32 %in, i8* %b1)
+  %t2 = call i32 @foo(i32 %in, i8* %b1)
+  call void @llvm.lifetime.end(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t9 = call i32 @foo(i32 %in, i8* %b2)
+  %t8 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  %t3 = call i32 @foo(i32 %in, i8* %b3)
+  %t4 = call i32 @foo(i32 %in, i8* %b3)
+  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  %t11 = call i32 @foo(i32 %in, i8* %b4)
+  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+}
+
+;YESCOLOR: subq  $112, %rsp
+;NOCOLOR: subq  $400, %rsp
+; All four lifetime ranges are disjoint; 112 bytes is the largest array ([14 x i8*]).
+define i32 @myCall2_w4(i32 %in) {
+entry:
+  %a1 = alloca [14 x i8*], align 8
+  %a2 = alloca [13 x i8*], align 8
+  %a3 = alloca [12 x i8*], align 8
+  %a4 = alloca [11 x i8*], align 8
+  %b1 = bitcast [14 x i8*]* %a1 to i8*
+  %b2 = bitcast [13 x i8*]* %a2 to i8*
+  %b3 = bitcast [12 x i8*]* %a3 to i8*
+  %b4 = bitcast [11 x i8*]* %a4 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b1)
+  %t1 = call i32 @foo(i32 %in, i8* %b1)
+  %t2 = call i32 @foo(i32 %in, i8* %b1)
+  call void @llvm.lifetime.end(i64 -1, i8* %b1)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t9 = call i32 @foo(i32 %in, i8* %b2)
+  %t8 = call i32 @foo(i32 %in, i8* %b2)
+  call void @llvm.lifetime.end(i64 -1, i8* %b2)
+  call void @llvm.lifetime.start(i64 -1, i8* %b3)
+  %t3 = call i32 @foo(i32 %in, i8* %b3)
+  %t4 = call i32 @foo(i32 %in, i8* %b3)
+  call void @llvm.lifetime.end(i64 -1, i8* %b3)
+  br i1 undef, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b4)
+  %t11 = call i32 @foo(i32 %in, i8* %b4)
+  call void @llvm.lifetime.end(i64 -1, i8* %b4)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: subq  $144, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+; Adapted from the test in Transforms/Inline/array_merge.ll.
+;YESCOLOR: subq  $816, %rsp
+;NOCOLOR: subq  $1616, %rsp
+define void @array_merge() nounwind ssp {
+entry:
+  %A.i1 = alloca [100 x i32], align 4
+  %B.i2 = alloca [100 x i32], align 4
+  %A.i = alloca [100 x i32], align 4
+  %B.i = alloca [100 x i32], align 4
+  %0 = bitcast [100 x i32]* %A.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+  %1 = bitcast [100 x i32]* %B.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  %2 = bitcast [100 x i32]* %A.i1 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+  %3 = bitcast [100 x i32]* %B.i2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+  call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+  ret void
+}
+
+;YESCOLOR: subq  $272, %rsp
+;NOCOLOR: subq  $272, %rsp
+define i32 @func_phi_lifetime(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  br i1 %d, label %bb0, label %bb1
+
+bb0:
+  %I1 = bitcast [17 x i8*]* %a to i8*
+  br label %bb2
+
+bb1:
+  %I2 = bitcast [16 x i8*]* %a2 to i8*
+  br label %bb2
+
+bb2:
+  %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ]
+  call void @llvm.lifetime.start(i64 -1, i8* %split)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  call void @llvm.lifetime.end(i64 -1, i8* %split)
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+
+;YESCOLOR: multi_region_bb
+;NOCOLOR: multi_region_bb
+define void @multi_region_bb() nounwind ssp {
+entry:
+  %A.i1 = alloca [100 x i32], align 4
+  %B.i2 = alloca [100 x i32], align 4
+  %A.i = alloca [100 x i32], align 4
+  %B.i = alloca [100 x i32], align 4
+  %0 = bitcast [100 x i32]* %A.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1
+  %1 = bitcast [100 x i32]* %B.i to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+  call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+  %2 = bitcast [100 x i32]* %A.i1 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+  %3 = bitcast [100 x i32]* %B.i2 to i8*
+  call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+  call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind  ; <---- start #2
+  call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+  call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+  ret void
+}
+
+
+;YESCOLOR: subq  $272, %rsp
+;NOCOLOR: subq  $272, %rsp
+
+define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
+entry:
+  %a = alloca [17 x i8*], align 8
+  %a2 = alloca [16 x i8*], align 8
+  %b = bitcast [17 x i8*]* %a to i8*
+  %b2 = bitcast [16 x i8*]* %a2 to i8*
+  %t1 = call i32 @foo(i32 %in, i8* %b)
+  %t2 = call i32 @foo(i32 %in, i8* %b)
+  call void @llvm.lifetime.end(i64 -1, i8* %b)
+  call void @llvm.lifetime.start(i64 -1, i8* %b)
+  br i1 %d, label %bb2, label %bb3
+bb2:
+  call void @llvm.lifetime.start(i64 -1, i8* %b2)
+  %t3 = call i32 @foo(i32 %in, i8* %b2)
+  %t4 = call i32 @foo(i32 %in, i8* %b2)
+  %t5 = add i32 %t1, %t2
+  %t6 = add i32 %t3, %t4
+  %t7 = add i32 %t5, %t6
+  ret i32 %t7
+bb3:
+  ret i32 0
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+ declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 0000000..e7c9605
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux  | FileCheck %s
+
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+  %result = sdiv i32 %a, %b
+  ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+  %result = srem i32 %a, %b
+  ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CHECK-NOT: idivl
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, %b    ; quotient and remainder of the same operands:
+  %resultrem = srem i32 %a, %b    ; one idivl/divb pair is expected, none after the ret
+  %result = add i32 %resultdiv, %resultrem
+  ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+  %resultidiv = sdiv i32 %a, %b
+  %resultdiv = udiv i32 %a, %b
+  %result = add i32 %resultidiv, %resultdiv
+  ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+  %resultdiv = sdiv i32 256, 4
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CHECK-NOT: testl
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, 33    ; constant divisor: expect no bypass testl and no divide (bare "test" would match the symbol name)
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CHECK-NOT: testl
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultrem = srem i32 %a, 33    ; constant divisor: expect no bypass testl and no divide (bare "test" would match the symbol name)
+  ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CHECK-NOT: testl
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+  %resultdiv = sdiv i32 %a, 33    ; constant divisor for both operations: expect no bypass
+  %resultrem = srem i32 %a, 33    ; testl and no divide (bare "test" would match the symbol name)
+  %result = add i32 %resultdiv, %resultrem
+  ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+  %resultdiv = sdiv i32 4, %a
+  ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+  %resultrem = srem i32 4, %a    ; remainder with an immediate dividend (the sdiv form is covered by test_use_div_imm_reg)
+  ret i32 %resultrem
+}
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 8ad0fa8..95854c7 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -109,8 +109,8 @@ allocas:
 ; rdar://10566486
 ; CHECK: fneg
 ; CHECK: vxorps
-define <16 x float> @fneg(<16 x float> addrspace(1)* nocapture %out) nounwind {
-  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+define <16 x float> @fneg(<16 x float> %a) nounwind {
+  %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
   ret <16 x float> %1
 }
 
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709..ec11654 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@ define   <8 x float> @test17(<4 x float> %y) {
 }
 
 ; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
 ; CHECK: ret
 define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
 }
 
 ; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
 ; CHECK: ret
 define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
   %S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6ca..ff56a45 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
 }
 
 ; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
   %1 = bitcast float* %addr to <4 x float>*
   store <4 x float> %0, <4 x float>* %1, align 16
   ret void
@@ -32,27 +32,13 @@ entry:
 
 declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
 
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
-  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
-  %1 = bitcast float* %addr to i8*
-  tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
-  ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
 ; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
 entry:
-  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
   %1 = bitcast double* %addr to <2 x double>*
   store <2 x double> %0, <2 x double>* %1, align 16
   ret void
@@ -60,28 +46,14 @@ entry:
 
 declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
 
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
-  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
-  %1 = bitcast double* %addr to i8*
-  tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
-  ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
 ; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
 ; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
 define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
 entry:
   %0 = bitcast <4 x i64> %a to <8 x i32>
-  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
   %2 = bitcast <4 x i32> %1 to <2 x i64>
   store <2 x i64> %2, <2 x i64>* %addr, align 16
   ret void
@@ -90,17 +62,43 @@ entry:
 declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
 
 ; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+  %1 = bitcast float* %addr to <4 x float>*
+  store <4 x float> %0, <4 x float>* %1, align 16
+  ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+  %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+  %1 = bitcast double* %addr to <2 x double>*
+  store <2 x double> %0, <2 x double>* %1, align 16
+  ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
 entry:
   %0 = bitcast <4 x i64> %a to <8 x i32>
   %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
-  %2 = bitcast <2 x i64>* %addr to i8*
-  %3 = bitcast <4 x i32> %1 to <16 x i8>
-  tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 16
   ret void
 }
 
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+  %0 = bitcast <4 x i64> %a to <8 x i32>
+  %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+  %2 = bitcast <4 x i32> %1 to <2 x i64>
+  store <2 x i64> %2, <2 x i64>* %addr, align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa..267a806 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,37 @@ entry:
   %shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
   ret <16 x i16> %shuffle.i
 }
+
+; CHECK: vpshufb_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+  %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
+                                                                i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,  
+                                                                i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
+                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+  ret <32 x i8>%S    ; single-input byte shuffle; expected to use vpshufb with a memory-operand mask
+}
+
+; CHECK: vpshufb1_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
+  %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
+                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
+                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
+                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+  ret <32 x i8>%S    ; indices 36 and 49 pick zero bytes from the second operand; still expected to use vpshufb
+}
+
+
+; CHECK: vpshufb2_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
+  %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15, 
+                                                                i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,  
+                                                                i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25, 
+                                                                i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+  ret <32 x i8>%S    ; operands swapped relative to vpshufb1_test: only indices 36 and 49 read %a; still expected to use vpshufb
+}
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index 0cb9fd9..09eb5d1 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand | FileCheck %s
 
 define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
   %t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
@@ -39,4 +39,20 @@ define i32 @bax(<2 x i64> %c) {
 ; CHECK: ret
 }
 
+define i32 @rnd(i32 %arg) nounwind uwtable {
+  %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+  %2 = extractvalue { i32, i32 } %1, 0
+  %3 = extractvalue { i32, i32 } %1, 1
+  %4 = icmp eq i32 %3, 0
+  %5 = select i1 %4, i32 0, i32 %arg
+  %6 = add i32 %5, %2
+  ret i32 %6
+; CHECK: rnd
+; CHECK: rdrand
+; CHECK: cmov
+; CHECK-NOT: cmov
+; CHECK: ret
+}
+
 declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+declare { i32, i32 } @llvm.x86.rdrand.32() nounwind
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663..85a70aa 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s  -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
 
 target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
 target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@ block2:
 ; CHECK: cvtsi2sdq {{.*}} %xmm0
 ; CHECK: movb $1, %al
 ; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
   call void (...)* @test16callee(double 1.000000e+00)
   ret void
 }
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a..bd3514c 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma  | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10               | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10             | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=+fma,-fma4  | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10  -mattr=-fma,-fma4  | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10  -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4  | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
 
 ; CHECK: test_f32
 ; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
 
 define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
 entry:
@@ -15,7 +17,7 @@ entry:
 
 ; CHECK: test_f64
 ; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
 
 define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
 entry:
@@ -24,7 +26,7 @@ entry:
 }
 
 ; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
 
 define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
 entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e0..e3910a6 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
 
 define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   ; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b3..2fe1ecd 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
 ; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
 
 ; VFMADD
 define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87..6d98d59 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
 ; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
 
 ; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 }
 
 ; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
 }
 
 ; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
 }
 
 ; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps     %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: vfnmsubps     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
   %x = fmul <4 x float> %a0, %a1
   %y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
 }
 
 ; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
 }
 
 ; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
 }
 
 ; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
   %res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
 }
 
 ; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps     %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
 define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
   %x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
 }
 
 ; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd     %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
   %x = fmul <4 x double> %a0, %a1
   %res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4
 }
 
 ; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd     %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd     %ymm2, %ymm1, %ymm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd     %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
 define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
   %x = fmul <4 x double> %a0, %a1
   %res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4
 }
 
 ; CHECK: test_x86_fmsub_pd
-; CHECK: vfmsub213pd     %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213pd     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd
+; CHECK_FMA4: vfmsubpd     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
   %x = fmul <2 x double> %a0, %a1
   %res = fsub <2 x double> %x, %a2
@@ -102,8 +134,11 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
 }
 
 ; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss    %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213ss    %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ss
+; CHECK_FMA4: vfnmaddss    %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
   %x = fmul float %a0, %a1
   %res = fsub float %a2, %x
@@ -111,8 +146,11 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
 }
 
 ; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd     %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213sd     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_sd
+; CHECK_FMA4: vfnmaddsd     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
   %x = fmul double %a0, %a1
   %res = fsub double %a2, %x
@@ -120,8 +158,11 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
 }
 
 ; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd     %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213sd     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_sd
+; CHECK_FMA4: vfmsubsd     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
   %x = fmul double %a0, %a1
   %res = fsub double %x, %a2
@@ -129,11 +170,43 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
 }
 
 ; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss     %xmm2, %xmm0, %xmm1
+; CHECK: vfnmsub213ss     %xmm2, %xmm1, %xmm0
 ; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ss
+; CHECK_FMA4: vfnmsubss     %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
 define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
   %x = fsub float -0.000000e+00, %a0
   %y = fmul float %x, %a1
   %res = fsub float %y, %a2
   ret float %res
 }
+
+; CHECK: test_x86_fmadd_ps_load
+; CHECK: vmovaps         (%rdi), %xmm2
+; CHECK: vfmadd213ps     %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_load
+; CHECK_FMA4: vfmaddps     %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = load <4 x float>* %a0              ; first multiplicand comes from memory
+  %y = fmul <4 x float> %x, %a1           ; y = load(a0) * a1
+  %res = fadd <4 x float> %y, %a2         ; res = y + a2; expected to be contracted into one fused multiply-add
+  ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_load
+; CHECK: vmovaps         (%rdi), %xmm2
+; CHECK: vfmsub213ps     %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_load
+; CHECK_FMA4: vfmsubps     %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+  %x = load <4 x float>* %a0              ; first multiplicand comes from memory
+  %y = fmul <4 x float> %x, %a1           ; y = load(a0) * a1
+  %res = fsub <4 x float> %y, %a2         ; res = y - a2; expected to be contracted into one fused multiply-subtract
+  ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
new file mode 100644
index 0000000..091f0de
--- /dev/null
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=x86-64 -mattr=-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1(float %a) {           ; (a + a) + (a + a): the directives below require one mulss and no addss
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+  %t1 = fadd float %a, %a                 ; t1 = a + a
+  %r = fadd float %t1, %t1                ; r = t1 + t1
+  ret float %r
+}
+
+; CHECK: test2
+define float @test2(float %a) {           ; 4*a + (a + a): the directives below require one mulss and no addss
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+  %t1 = fmul float 4.0, %a                ; t1 = 4 * a
+  %t2 = fadd float %a, %a                 ; t2 = a + a
+  %r = fadd float %t1, %t2                ; r = t1 + t2
+  ret float %r
+}
+
+; CHECK: test3
+define float @test3(float %a) {           ; 2*a - (a + a): the directives below require an xorps and no addss
+; CHECK-NOT: addss
+; CHECK: xorps
+; CHECK-NOT: addss
+; CHECK: ret
+  %t1 = fmul float 2.0, %a                ; t1 = 2 * a
+  %t2 = fadd float %a, %a                 ; t2 = a + a
+  %r = fsub float %t1, %t2                ; r = t1 - t2
+  ret float %r
+}
+
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 91576fb..597236e 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -19,3 +19,12 @@ entry:
 	%1 = load i64* %retval		; <i64> [#uses=1]
 	ret i64 %1
 }
+
+; The tied operands are not necessarily in the same order as the defs.
+; PR13742
+define i64 @swapped(i64 %x, i64 %y) nounwind {
+entry:
+	%x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind ; input constraints "1,0": %x is tied to output 1, %y to output 0
+        %x1 = extractvalue { i64, i64 } %x0, 0 ; only output 0 (the one tied to %y) is used
+        ret i64 %x1
+}
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
new file mode 100644
index 0000000..014132b
--- /dev/null
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+
+define i32 @t1() nounwind {               ; Intel-dialect inline asm: the body must be emitted between .intel_syntax and .att_syntax
+entry:
+  %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0Amov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind ; $0 is the "=r" output, $1 the "r" input (constant 1); eax is listed as clobbered
+  ret i32 %0
+; CHECK: t1
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, ecx
+; CHECK: mov ecx, eax
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 984d7e5..51320dd 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,14 +1,10 @@
 ; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; XFAIL: *
 ; rdar://5571034
 
 ; This requires physreg joining, %vreg13 is live everywhere:
 ; 304L		%CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
 ; 320L		%vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
 ; 336L		%vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
-;
-; This test is XFAIL until the register allocator understands trivial physreg
-; interference. <rdar://9802098>
 
 define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
 ; CHECK: foo:
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
new file mode 100644
index 0000000..84102f1
--- /dev/null
+++ b/test/CodeGen/X86/pr12312.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
+
+define i32 @veccond(<4 x i32> %input) {   ; branch on "vector is non-zero"; expected to use ptest / vptest
+entry:
+  %0 = bitcast <4 x i32> %input to i128   ; view the whole vector as one 128-bit integer
+  %1 = icmp ne i128 %0, 0                 ; true when any bit of the vector is set
+  br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block:                                    ; preds = %entry
+  ret i32 0
+endif-block:                                      ; preds = %entry
+  ret i32 1
+; SSE41: veccond
+; SSE41: ptest
+; SSE41: ret
+; AVX:   veccond
+; AVX:   vptest
+; AVX:   ret
+}
+
+define i32 @vectest(<4 x i32> %input) {   ; materialize "vector is non-zero" as 0/1; expected to use ptest / vptest
+entry:
+  %0 = bitcast <4 x i32> %input to i128   ; view the whole vector as one 128-bit integer
+  %1 = icmp ne i128 %0, 0                 ; true when any bit of the vector is set
+  %2 = zext i1 %1 to i32                  ; widen the flag to the i32 return value
+  ret i32 %2
+; SSE41: vectest
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vectest
+; AVX:   vptest
+; AVX:   ret
+}
+
+define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) { ; select on "vector is non-zero"; expected to use ptest / vptest
+entry:
+  %0 = bitcast <4 x i32> %input to i128   ; view the whole vector as one 128-bit integer
+  %1 = icmp ne i128 %0, 0                 ; true when any bit of the vector is set
+  %2 = select i1 %1, i32 %a, i32 %b       ; %a if non-zero, otherwise %b
+  ret i32 %2
+; SSE41: vecsel
+; SSE41: ptest
+; SSE41: ret
+; AVX:   vecsel
+; AVX:   vptest
+; AVX:   ret
+}
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
new file mode 100644
index 0000000..024b163
--- /dev/null
+++ b/test/CodeGen/X86/pr12359.ll
@@ -0,0 +1,10 @@
+; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
+define <16 x i8> @shuf(<16 x i8> %inval1) { ; byte shuffle that mixes input bytes with zeros; the directives require pshufb directly followed by ret
+entry:
+  %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4> ; mask index 16 picks element 0 of the zero vector
+  ret <16 x i8> %0
+; CHECK: shuf
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 51c3d23..b823f0a 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -76,12 +76,12 @@ entry:
 
 ; X32:    f5:
 ; X32:      leal {{[jk]}}@TLSLDM(%ebx)
-; X32-NEXT: calll ___tls_get_addr@PLT
-; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
-; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+; X32: calll ___tls_get_addr@PLT
+; X32: movl {{[jk]}}@DTPOFF(%e
+; X32: addl {{[jk]}}@DTPOFF(%e
 
 ; X64:    f5:
 ; X64:      leaq {{[jk]}}@TLSLD(%rip), %rdi
-; X64-NEXT: callq	__tls_get_addr@PLT
-; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
-; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
+; X64: callq	__tls_get_addr@PLT
+; X64: movl {{[jk]}}@DTPOFF(%r
+; X64: addl {{[jk]}}@DTPOFF(%r
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
new file mode 100644
index 0000000..82517cb
--- /dev/null
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @fabs_v2f64(<2 x double> %p) ; vector fabs on 2 doubles; the directive below expects vandps even for double elements
+{
+  ; CHECK: fabs_v2f64
+  ; CHECK: vandps
+  %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+  ret <2 x double> %t
+}
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+
+define <4 x float> @fabs_v4f32(<4 x float> %p) ; vector fabs on 4 floats; the directive below expects a vandps
+{
+  ; CHECK: fabs_v4f32
+  ; CHECK: vandps
+  %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+  ret <4 x float> %t
+}
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+
+define <4 x double> @fabs_v4f64(<4 x double> %p) ; vector fabs on 4 doubles; the directive below expects vandps even for double elements
+{
+  ; CHECK: fabs_v4f64
+  ; CHECK: vandps
+  %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+  ret <4 x double> %t
+}
+declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+
+define <8 x float> @fabs_v8f32(<8 x float> %p) ; vector fabs on 8 floats; the directive below expects a vandps
+{
+  ; CHECK: fabs_v8f32
+  ; CHECK: vandps
+  %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+  ret <8 x float> %t
+}
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
new file mode 100644
index 0000000..5e0160b
--- /dev/null
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
+define <2 x double> @floor_v2f64(<2 x double> %p) ; vector floor on 2 doubles; the directive below expects vroundpd
+{
+  ; CHECK: floor_v2f64
+  ; CHECK: vroundpd
+  %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+  ret <2 x double> %t
+}
+declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+
+define <4 x float> @floor_v4f32(<4 x float> %p) ; vector floor on 4 floats; the directive below expects vroundps
+{
+  ; CHECK: floor_v4f32
+  ; CHECK: vroundps
+  %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+  ret <4 x float> %t
+}
+declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+
+define <4 x double> @floor_v4f64(<4 x double> %p) ; vector floor on 4 doubles; the directive below expects vroundpd
+{
+  ; CHECK: floor_v4f64
+  ; CHECK: vroundpd
+  %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+  ret <4 x double> %t
+}
+declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+
+define <8 x float> @floor_v8f32(<8 x float> %p) ; vector floor on 8 floats; the directive below expects vroundps
+{
+  ; CHECK: floor_v8f32
+  ; CHECK: vroundps
+  %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
+  ret <8 x float> %t
+}
+declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 05b263e..dc0464f 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,14 +1,38 @@
 ; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck --check-prefix=AVX %s
 
 ; PR11674
 define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
 entry:
-; TODO: We should be able to generate cvtps2pd for the load.
-; For now, just check that we generate something sane.
-; CHECK: cvtss2sd
-; CHECK: cvtss2sd
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
   %0 = load <2 x float>* %in, align 8
   %1 = fpext <2 x float> %0 to <2 x double>
   store <2 x double> %1, <2 x double>* %out, align 1
   ret void
 }
+
+define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) { ; 4 floats -> 4 doubles, with the conversion expected to take its source straight from memory
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+  %0 = load <4 x float>* %in              ; no explicit alignment on the load
+  %1 = fpext <4 x float> %0 to <4 x double>
+  store <4 x double> %1, <4 x double>* %out, align 1 ; result store is marked align 1
+  ret void
+}
+
+define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) { ; 8 floats -> 8 doubles, with each conversion expected to take its source straight from memory
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+  %0 = load <8 x float>* %in              ; no explicit alignment on the load
+  %1 = fpext <8 x float> %0 to <8 x double>
+  store <8 x double> %1, <8 x double>* %out, align 1 ; result store is marked align 1
+  ret void
+}
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 086af6b..8dfc2ea 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep unpcklps %t | count 1
-; RUN: grep unpckhps %t | count 3
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom -mattr=+sse41 | FileCheck -check-prefix=ATOM %s
 
 ; Transpose example using the more generic vector shuffle. Return float8
 ; instead of float16
@@ -14,6 +13,17 @@ target triple = "i386-apple-cl.1.0"
 
 define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
 entry:
+; CHECK: transpose2
+; CHECK: unpckhps
+; CHECK: unpckhps
+; CHECK: unpcklps
+; CHECK: unpckhps
+; Different instruction order for Atom.
+; ATOM: transpose2
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpcklps
 	%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
 	%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 >		; <<4 x float>> [#uses=2]
 	%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 >		; <<4 x float>> [#uses=2]
@@ -27,3 +37,32 @@ entry:
 ;       %r3 = shufflevector <8 x float> %r1,  <8 x float> %r2,  <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >; 
 	ret <8 x float> %r2
 }
+
+define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
+entry:
+; movhps should happen before extractps to ensure it gets the correct value.
+; CHECK: lo_hi_shift
+; CHECK: movhps ([[BASEREG:%[a-z]+]]),
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: lo_hi_shift
+; ATOM: movhps ([[BASEREG:%[a-z]+]]),
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+  %v.i = bitcast float* %y to <4 x float>*
+  %0 = load <4 x float>* %v.i, align 1    ; unaligned 16-byte load from %y
+  %1 = bitcast float* %x to <1 x i64>*
+  %.val = load <1 x i64>* %1, align 1     ; 8-byte load from %x
+  %2 = bitcast <1 x i64> %.val to <2 x float>
+  %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+  %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5> ; low two lanes from %0, high two lanes from the %x load
+  %cast.i = bitcast <4 x float> %0 to <2 x i64>
+  %extract.i = extractelement <2 x i64> %cast.i, i32 1 ; upper 64 bits of %0
+  %3 = bitcast float* %x to i64*
+  store i64 %extract.i, i64* %3, align 4  ; overwrites the memory at %x that was loaded above, so the load must come first
+  %4 = bitcast <4 x float> %0 to <16 x i8>
+  %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
+  %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23> ; upper 8 bytes of %5 followed by lower 8 bytes of %4
+  %6 = bitcast <16 x i8> %palignr to <2 x i64>
+  ret <2 x i64> %6
+}
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 9705d14..dfaa3d6 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,12 +1,17 @@
-; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
 ; PR4891
 ; PR5626
 
 ; This load should be before the call, not after.
 
-; CHECK: movaps    compl+128(%rip), %xmm0
-; CHECK: movaps  %xmm0, (%rsp)
-; CHECK: callq   killcommon
+; SSE: movaps    compl+128(%rip), %xmm0
+; SSE: movaps  %xmm0, (%rsp)
+; SSE: callq   killcommon
+
+; AVX: vmovapd    compl+128(%rip), %xmm0
+; AVX: vmovapd  %xmm0, (%rsp)
+; AVX: callq   killcommon
 
 @compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]