Diffstat (limited to 'test/CodeGen')
-rw-r--r--  test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll |  31
-rw-r--r--  test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll  | 129
-rw-r--r--  test/CodeGen/ARM/2012-08-30-select.ll            |  18
-rw-r--r--  test/CodeGen/ARM/atomic-op.ll                    |  10
-rw-r--r--  test/CodeGen/ARM/crash-shufflevector.ll          |   4
-rw-r--r--  test/CodeGen/ARM/domain-conv-vmovs.ll            |  84
-rw-r--r--  test/CodeGen/ARM/fast-isel-pic.ll                |  43
-rw-r--r--  test/CodeGen/ARM/fp-fast.ll                      |  60
-rw-r--r--  test/CodeGen/ARM/integer_insertelement.ll        |  35
-rw-r--r--  test/CodeGen/ARM/longMAC.ll                      |  44
-rw-r--r--  test/CodeGen/ARM/select.ll                       |   2
-rw-r--r--  test/CodeGen/ARM/select_xform.ll                 |  14
-rw-r--r--  test/CodeGen/ARM/sub-cmp-peephole.ll             |  21
-rw-r--r--  test/CodeGen/ARM/vdup.ll                         |  34
-rw-r--r--  test/CodeGen/ARM/vector-extend-narrow.ll         |  11
-rw-r--r--  test/CodeGen/ARM/vget_lane.ll                    |   2
-rw-r--r--  test/CodeGen/Generic/MachineBranchProb.ll        |  32
-rw-r--r--  test/CodeGen/Hexagon/args.ll                     |   4
-rw-r--r--  test/CodeGen/Hexagon/newvaluestore.ll            |   2
-rw-r--r--  test/CodeGen/Hexagon/remove_lsr.ll               |  80
-rw-r--r--  test/CodeGen/Hexagon/static.ll                   |   2
-rw-r--r--  test/CodeGen/Mips/brdelayslot.ll                 |  18
-rw-r--r--  test/CodeGen/Mips/init-array.ll                  |  14
-rw-r--r--  test/CodeGen/Mips/longbranch.ll                  |  10
-rw-r--r--  test/CodeGen/Mips/small-section-reserve-gp.ll    |  12
-rw-r--r--  test/CodeGen/Mips/tls-alias.ll                   |   2
-rw-r--r--  test/CodeGen/Mips/tls.ll                         |  12
-rw-r--r--  test/CodeGen/Mips/uitofp.ll                      |  12
-rw-r--r--  test/CodeGen/PowerPC/2010-03-09-indirect-call.ll |   5
-rw-r--r--  test/CodeGen/PowerPC/big-endian-formal-args.ll   |   4
-rw-r--r--  test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll     |  26
-rw-r--r--  test/CodeGen/PowerPC/fsl-e500mc.ll               |  22
-rw-r--r--  test/CodeGen/PowerPC/fsl-e5500.ll                |  22
-rw-r--r--  test/CodeGen/PowerPC/inlineasm-copy.ll           |   9
-rw-r--r--  test/CodeGen/PowerPC/ppc64-toc.ll                |  67
-rw-r--r--  test/CodeGen/PowerPC/ppc64-zext.ll               |  11
-rw-r--r--  test/CodeGen/PowerPC/pr13641.ll                  |  11
-rw-r--r--  test/CodeGen/PowerPC/remat-imm.ll                |  16
-rw-r--r--  test/CodeGen/Thumb/thumb_jump24_fixup.ll         |  23
-rw-r--r--  test/CodeGen/Thumb2/longMACt.ll                  |  44
-rw-r--r--  test/CodeGen/Thumb2/thumb2-select_xform.ll       |   4
-rw-r--r--  test/CodeGen/X86/2012-04-26-sdglue.ll            |   2
-rw-r--r--  test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll   |  20
-rw-r--r--  test/CodeGen/X86/StackColoring.ll                | 362
-rw-r--r--  test/CodeGen/X86/atom-bypass-slow-division.ll    | 112
-rw-r--r--  test/CodeGen/X86/avx-basic.ll                    |   4
-rw-r--r--  test/CodeGen/X86/avx-shuffle.ll                  |  10
-rw-r--r--  test/CodeGen/X86/avx-vextractf128.ll             |  88
-rw-r--r--  test/CodeGen/X86/avx2-shuffle.ll                 |  34
-rw-r--r--  test/CodeGen/X86/bool-simplify.ll                |  18
-rw-r--r--  test/CodeGen/X86/fast-isel-x86-64.ll             |   8
-rw-r--r--  test/CodeGen/X86/fma.ll                          |  16
-rwxr-xr-x  test/CodeGen/X86/fma3-intrinsics.ll              |   4
-rw-r--r--  test/CodeGen/X86/fma4-intrinsics-x86_64.ll       |   1
-rw-r--r--  test/CodeGen/X86/fma_patterns.ll                 | 103
-rw-r--r--  test/CodeGen/X86/fp-fast.ll                      |  37
-rw-r--r--  test/CodeGen/X86/inline-asm-tied.ll              |   9
-rw-r--r--  test/CodeGen/X86/ms-inline-asm.ll                |  14
-rw-r--r--  test/CodeGen/X86/phys_subreg_coalesce-3.ll       |   4
-rw-r--r--  test/CodeGen/X86/pr12312.ll                      |  48
-rw-r--r--  test/CodeGen/X86/pr12359.ll                      |  10
-rw-r--r--  test/CodeGen/X86/tls-pic.ll                      |  12
-rw-r--r--  test/CodeGen/X86/vec_fabs.ll                     |  38
-rw-r--r--  test/CodeGen/X86/vec_floor.ll                    |  38
-rw-r--r--  test/CodeGen/X86/vec_fpext.ll                    |  32
-rw-r--r--  test/CodeGen/X86/vec_shuffle-26.ll               |  45
-rw-r--r--  test/CodeGen/X86/widen_load-1.ll                 |  13
67 files changed, 1951 insertions, 137 deletions
diff --git a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
index f80b44f..1769ee5 100644
--- a/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
+++ b/test/CodeGen/ARM/2011-11-29-128bitArithmetics.ll
@@ -300,3 +300,34 @@ L.entry:
declare <4 x float> @llvm.sin.v4f32(<4 x float>) nounwind readonly
+define void @test_floor(<4 x float>* %X) nounwind {
+
+; CHECK: test_floor:
+
+; CHECK: movw [[reg0:r[0-9]+]], :lower16:{{.*}}
+; CHECK: movt [[reg0]], :upper16:{{.*}}
+; CHECK: vldmia r{{[0-9][0-9]?}}, {{.*}}
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: {{v?mov(.32)?}} r0,
+; CHECK: bl {{.*}}floorf
+
+; CHECK: vstmia {{.*}}
+
+L.entry:
+ %0 = load <4 x float>* @A, align 16
+ %1 = call <4 x float> @llvm.floor.v4f32(<4 x float> %0)
+ store <4 x float> %1, <4 x float>* %X, align 16
+ ret void
+}
+
+declare <4 x float> @llvm.floor.v4f32(<4 x float>) nounwind readonly
+
diff --git a/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
new file mode 100644
index 0000000..ec7f72d
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-27-CopyPhysRegCrash.ll
@@ -0,0 +1,129 @@
+; RUN: llc < %s -mcpu=cortex-a8 -march=thumb
+; Test that this doesn't crash.
+; <rdar://problem/12183003>
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:32:64-v128:32:128-a0:0:32-n32-S32"
+target triple = "thumbv7-apple-ios5.1.0"
+
+declare { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8*, i32) nounwind readonly
+
+declare void @llvm.arm.neon.vst1.v16i8(i8*, <16 x i8>, i32) nounwind
+
+define void @findEdges(i8*) nounwind ssp {
+ %2 = icmp sgt i32 undef, 0
+ br i1 %2, label %5, label %3
+
+; <label>:3 ; preds = %5, %1
+ %4 = phi i8* [ %0, %1 ], [ %19, %5 ]
+ ret void
+
+; <label>:5 ; preds = %5, %1
+ %6 = phi i8* [ %19, %5 ], [ %0, %1 ]
+ %7 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* null, i32 1)
+ %8 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %7, 0
+ %9 = getelementptr inbounds i8* null, i32 3
+ %10 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %9, i32 1)
+ %11 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %10, 2
+ %12 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %6, i32 1)
+ %13 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 0
+ %14 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %12, 1
+ %15 = getelementptr inbounds i8* %6, i32 3
+ %16 = tail call { <16 x i8>, <16 x i8>, <16 x i8> } @llvm.arm.neon.vld3.v16i8(i8* %15, i32 1)
+ %17 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 1
+ %18 = extractvalue { <16 x i8>, <16 x i8>, <16 x i8> } %16, 2
+ %19 = getelementptr inbounds i8* %6, i32 48
+ %20 = bitcast <16 x i8> %13 to <2 x i64>
+ %21 = bitcast <16 x i8> %8 to <2 x i64>
+ %22 = bitcast <16 x i8> %14 to <2 x i64>
+ %23 = shufflevector <2 x i64> %22, <2 x i64> undef, <1 x i32> zeroinitializer
+ %24 = bitcast <1 x i64> %23 to <8 x i8>
+ %25 = zext <8 x i8> %24 to <8 x i16>
+ %26 = sub <8 x i16> zeroinitializer, %25
+ %27 = bitcast <16 x i8> %17 to <2 x i64>
+ %28 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %26) nounwind
+ %29 = mul <8 x i16> %28, %28
+ %30 = add <8 x i16> zeroinitializer, %29
+ %31 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> undef, <8 x i16> %30) nounwind
+ %32 = bitcast <16 x i8> %11 to <2 x i64>
+ %33 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> zeroinitializer
+ %34 = bitcast <1 x i64> %33 to <8 x i8>
+ %35 = zext <8 x i8> %34 to <8 x i16>
+ %36 = sub <8 x i16> %35, zeroinitializer
+ %37 = bitcast <16 x i8> %18 to <2 x i64>
+ %38 = shufflevector <2 x i64> %37, <2 x i64> undef, <1 x i32> zeroinitializer
+ %39 = bitcast <1 x i64> %38 to <8 x i8>
+ %40 = zext <8 x i8> %39 to <8 x i16>
+ %41 = sub <8 x i16> zeroinitializer, %40
+ %42 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %36) nounwind
+ %43 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %41) nounwind
+ %44 = mul <8 x i16> %42, %42
+ %45 = mul <8 x i16> %43, %43
+ %46 = add <8 x i16> %45, %44
+ %47 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %31, <8 x i16> %46) nounwind
+ %48 = bitcast <8 x i16> %47 to <2 x i64>
+ %49 = shufflevector <2 x i64> %48, <2 x i64> undef, <1 x i32> zeroinitializer
+ %50 = bitcast <1 x i64> %49 to <4 x i16>
+ %51 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %50, <4 x i16> undef) nounwind
+ %52 = tail call <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32> %51, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+ %53 = bitcast <4 x i16> %52 to <1 x i64>
+ %54 = shufflevector <1 x i64> %53, <1 x i64> undef, <2 x i32> <i32 0, i32 1>
+ %55 = bitcast <2 x i64> %54 to <8 x i16>
+ %56 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %55, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %57 = shufflevector <2 x i64> %20, <2 x i64> undef, <1 x i32> <i32 1>
+ %58 = bitcast <1 x i64> %57 to <8 x i8>
+ %59 = zext <8 x i8> %58 to <8 x i16>
+ %60 = sub <8 x i16> zeroinitializer, %59
+ %61 = shufflevector <2 x i64> %21, <2 x i64> undef, <1 x i32> <i32 1>
+ %62 = bitcast <1 x i64> %61 to <8 x i8>
+ %63 = zext <8 x i8> %62 to <8 x i16>
+ %64 = sub <8 x i16> %63, zeroinitializer
+ %65 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %60) nounwind
+ %66 = mul <8 x i16> %65, %65
+ %67 = add <8 x i16> zeroinitializer, %66
+ %68 = shufflevector <2 x i64> %27, <2 x i64> undef, <1 x i32> <i32 1>
+ %69 = bitcast <1 x i64> %68 to <8 x i8>
+ %70 = zext <8 x i8> %69 to <8 x i16>
+ %71 = sub <8 x i16> zeroinitializer, %70
+ %72 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> undef) nounwind
+ %73 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %71) nounwind
+ %74 = mul <8 x i16> %72, %72
+ %75 = mul <8 x i16> %73, %73
+ %76 = add <8 x i16> %75, %74
+ %77 = shufflevector <2 x i64> %32, <2 x i64> undef, <1 x i32> <i32 1>
+ %78 = bitcast <1 x i64> %77 to <8 x i8>
+ %79 = zext <8 x i8> %78 to <8 x i16>
+ %80 = sub <8 x i16> %79, zeroinitializer
+ %81 = tail call <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16> %80) nounwind
+ %82 = mul <8 x i16> %81, %81
+ %83 = add <8 x i16> zeroinitializer, %82
+ %84 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %76, <8 x i16> %83) nounwind
+ %85 = tail call <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16> %67, <8 x i16> %84) nounwind
+ %86 = bitcast <8 x i16> %85 to <2 x i64>
+ %87 = shufflevector <2 x i64> %86, <2 x i64> undef, <1 x i32> <i32 1>
+ %88 = bitcast <1 x i64> %87 to <4 x i16>
+ %89 = tail call <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16> %88, <4 x i16> undef) nounwind
+ %90 = tail call <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32> %89, <4 x i32> <i32 -6, i32 -6, i32 -6, i32 -6>)
+ %91 = bitcast <4 x i16> %90 to <1 x i64>
+ %92 = shufflevector <1 x i64> undef, <1 x i64> %91, <2 x i32> <i32 0, i32 1>
+ %93 = bitcast <2 x i64> %92 to <8 x i16>
+ %94 = tail call <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16> %93, <8 x i16> <i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8, i16 -8>)
+ %95 = bitcast <8 x i8> %56 to <1 x i64>
+ %96 = bitcast <8 x i8> %94 to <1 x i64>
+ %97 = shufflevector <1 x i64> %95, <1 x i64> %96, <2 x i32> <i32 0, i32 1>
+ %98 = bitcast <2 x i64> %97 to <16 x i8>
+ tail call void @llvm.arm.neon.vst1.v16i8(i8* null, <16 x i8> %98, i32 1)
+ %99 = icmp slt i32 undef, undef
+ br i1 %99, label %5, label %3
+}
+
+declare <4 x i16> @llvm.arm.neon.vqshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <8 x i8> @llvm.arm.neon.vshiftn.v8i8(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <4 x i16> @llvm.arm.neon.vqrshiftnu.v4i16(<4 x i32>, <4 x i32>) nounwind readnone
+
+declare <4 x i32> @llvm.arm.neon.vmullu.v4i32(<4 x i16>, <4 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vmaxu.v8i16(<8 x i16>, <8 x i16>) nounwind readnone
+
+declare <8 x i16> @llvm.arm.neon.vabs.v8i16(<8 x i16>) nounwind readnone
diff --git a/test/CodeGen/ARM/2012-08-30-select.ll b/test/CodeGen/ARM/2012-08-30-select.ll
new file mode 100644
index 0000000..8471be5
--- /dev/null
+++ b/test/CodeGen/ARM/2012-08-30-select.ll
@@ -0,0 +1,18 @@
+; RUN: llc < %s -mtriple=thumbv7-apple-ios | FileCheck %s
+; rdar://12201387
+
+;CHECK: select_s_v_v
+;CHECK: it ne
+;CHECK-NEXT: vmovne.i32
+;CHECK: bx
+define <16 x i8> @select_s_v_v(i32 %avail, i8* %bar) {
+entry:
+ %vld1 = call <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* %bar, i32 1)
+ %and = and i32 %avail, 1
+ %tobool = icmp eq i32 %and, 0
+ %vld1. = select i1 %tobool, <16 x i8> %vld1, <16 x i8> zeroinitializer
+ ret <16 x i8> %vld1.
+}
+
+declare <16 x i8> @llvm.arm.neon.vld1.v16i8(i8* , i32 )
+
diff --git a/test/CodeGen/ARM/atomic-op.ll b/test/CodeGen/ARM/atomic-op.ll
index 8967730..6e6b363 100644
--- a/test/CodeGen/ARM/atomic-op.ll
+++ b/test/CodeGen/ARM/atomic-op.ll
@@ -159,3 +159,13 @@ entry:
store i8 %3, i8* %old
ret void
}
+
+; CHECK: func4
+; This function should not need to use callee-saved registers.
+; rdar://problem/12203728
+; CHECK-NOT: r4
+define i32 @func4(i32* %p) nounwind optsize ssp {
+entry:
+ %0 = atomicrmw add i32* %p, i32 1 monotonic
+ ret i32 %0
+}
diff --git a/test/CodeGen/ARM/crash-shufflevector.ll b/test/CodeGen/ARM/crash-shufflevector.ll
index ece4234..bdc0e0e 100644
--- a/test/CodeGen/ARM/crash-shufflevector.ll
+++ b/test/CodeGen/ARM/crash-shufflevector.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -mtriple=armv7--
+; RUN: llc < %s -mtriple=armv7
declare void @g(<16 x i8>)
define void @f(<4 x i8> %param1, <4 x i8> %param2) {
@@ -7,4 +7,4 @@ define void @f(<4 x i8> %param1, <4 x i8> %param2) {
%z = shufflevector <16 x i8> %y1, <16 x i8> %y2, <16 x i32> <i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 16, i32 17, i32 18, i32 19>
call void @g(<16 x i8> %z)
ret void
-}
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/domain-conv-vmovs.ll b/test/CodeGen/ARM/domain-conv-vmovs.ll
new file mode 100644
index 0000000..e19185b
--- /dev/null
+++ b/test/CodeGen/ARM/domain-conv-vmovs.ll
@@ -0,0 +1,84 @@
+; RUN: llc -verify-machineinstrs -mtriple=armv7-none-linux-gnueabi -mcpu=cortex-a9 -mattr=+neon,+neonfp -float-abi=hard < %s | FileCheck %s
+
+define <2 x float> @test_vmovs_via_vext_lane0to0(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to0:
+ %vec = insertelement <2 x float> %in, float %arg, i32 0
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane0to1(float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane0to1:
+ %vec = insertelement <2 x float> %in, float %arg, i32 1
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d1, d0, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to0(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to0:
+ %vec = insertelement <2 x float> %in, float %arg, i32 0
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+define <2 x float> @test_vmovs_via_vext_lane1to1(float, float %arg, <2 x float> %in) {
+; CHECK: test_vmovs_via_vext_lane1to1:
+ %vec = insertelement <2 x float> %in, float %arg, i32 1
+ %res = fadd <2 x float> %vec, %vec
+
+; CHECK: vext.32 d1, d0, d1, #1
+; CHECK: vext.32 d1, d1, d1, #1
+; CHECK: vadd.f32 {{d[0-9]+}}, d1, d1
+
+ ret <2 x float> %res
+}
+
+
+define float @test_vmovs_via_vdup(float, float %ret, float %lhs, float %rhs) {
+; CHECK: test_vmovs_via_vdup:
+
+ ; Do an operation (which will end up NEON because of +neonfp) to convince the
+ ; execution-domain pass that NEON is a good thing to use.
+ %res = fadd float %ret, %ret
+ ; It makes sense for LLVM to do the addition in d0 here, because it's going
+ ; to be returned. This means it will want a "vmov s0, s1":
+; CHECK: vdup.32 d0, d0[1]
+
+ ret float %res
+}
+
+declare float @llvm.sqrt.f32(float)
+
+declare void @bar()
+
+; This is a compile test.
+define float @test_ineligible(float, float %in) {
+; CHECK: test_ineligible:
+
+ %sqrt = call float @llvm.sqrt.f32(float %in)
+ %val = fadd float %sqrt, %sqrt
+
+  ; This call forces a move from a callee-saved register to the return-reg. That
+  ; move is not eligible for conversion to a d-register instruction because the
+  ; use-def chains would be messed up. Primarily a compile test (we used to hit
+  ; an internal fault).
+ call void @bar()
+; CHECK: bl bar
+; CHECK: vmov.f32 {{s[0-9]+}}, {{s[0-9]+}}
+ ret float %val
+}
\ No newline at end of file
diff --git a/test/CodeGen/ARM/fast-isel-pic.ll b/test/CodeGen/ARM/fast-isel-pic.ll
new file mode 100644
index 0000000..392a845
--- /dev/null
+++ b/test/CodeGen/ARM/fast-isel-pic.ll
@@ -0,0 +1,43 @@
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=thumbv7-apple-ios | FileCheck %s --check-prefix=THUMB
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=arm-apple-ios | FileCheck %s --check-prefix=ARM
+; RUN: llc < %s -O0 -fast-isel-abort -relocation-model=pic -mtriple=armv7-apple-ios | FileCheck %s --check-prefix=ARMv7
+
+@g = global i32 0, align 4
+
+define i32 @LoadGV() {
+entry:
+; THUMB: LoadGV
+; THUMB: movw [[reg0:r[0-9]+]],
+; THUMB: movt [[reg0]],
+; THUMB: add [[reg0]], pc
+; ARM: LoadGV
+; ARM: ldr [[reg1:r[0-9]+]],
+; ARM: add [[reg1]], pc, [[reg1]]
+; ARMv7: LoadGV
+; ARMv7: movw [[reg2:r[0-9]+]],
+; ARMv7: movt [[reg2]],
+; ARMv7: add [[reg2]], pc, [[reg2]]
+ %tmp = load i32* @g
+ ret i32 %tmp
+}
+
+@i = external global i32
+
+define i32 @LoadIndirectSymbol() {
+entry:
+; THUMB: LoadIndirectSymbol
+; THUMB: movw r[[reg3:[0-9]+]],
+; THUMB: movt r[[reg3]],
+; THUMB: add r[[reg3]], pc
+; THUMB: ldr r[[reg3]], [r[[reg3]]]
+; ARM: LoadIndirectSymbol
+; ARM: ldr [[reg4:r[0-9]+]],
+; ARM: ldr [[reg4]], [pc, [[reg4]]]
+; ARMv7: LoadIndirectSymbol
+; ARMv7: movw r[[reg5:[0-9]+]],
+; ARMv7: movt r[[reg5]],
+; ARMv7: add r[[reg5]], pc, r[[reg5]]
+; ARMv7: ldr r[[reg5]], [r[[reg5]]]
+ %tmp = load i32* @i
+ ret i32 %tmp
+}
diff --git a/test/CodeGen/ARM/fp-fast.ll b/test/CodeGen/ARM/fp-fast.ll
new file mode 100644
index 0000000..ec57187
--- /dev/null
+++ b/test/CodeGen/ARM/fp-fast.ll
@@ -0,0 +1,60 @@
+; RUN: llc -march=arm -mcpu=cortex-a9 -mattr=+vfp4 -enable-unsafe-fp-math < %s | FileCheck %s
+
+; CHECK: test1
+define float @test1(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t1 = fmul float %x, 3.0
+ %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %t1)
+ ret float %t2
+}
+
+; CHECK: test2
+define float @test2(float %x, float %y) {
+; CHECK-NOT: vmul
+; CHECK: vfma.f32
+; CHECK-NOT: vmul
+ %t1 = fmul float %x, 3.0
+ %t2 = call float @llvm.fma.f32(float %t1, float 2.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test3
+define float @test3(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vadd.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float 1.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test4
+define float @test4(float %x, float %y) {
+; CHECK-NOT: vfma
+; CHECK: vsub.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float -1.0, float %y)
+ ret float %t2
+}
+
+; CHECK: test5
+define float @test5(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t2 = call float @llvm.fma.f32(float %x, float 2.0, float %x)
+ ret float %t2
+}
+
+; CHECK: test6
+define float @test6(float %x) {
+; CHECK-NOT: vfma
+; CHECK: vmul.f32
+; CHECK-NOT: vfma
+ %t1 = fsub float -0.0, %x
+ %t2 = call float @llvm.fma.f32(float %x, float 5.0, float %t1)
+ ret float %t2
+}
+
+declare float @llvm.fma.f32(float, float, float)
diff --git a/test/CodeGen/ARM/integer_insertelement.ll b/test/CodeGen/ARM/integer_insertelement.ll
new file mode 100644
index 0000000..4f2d7e3
--- /dev/null
+++ b/test/CodeGen/ARM/integer_insertelement.ll
@@ -0,0 +1,35 @@
+; RUN: llc %s -o - -march=arm -mattr=+neon | FileCheck %s
+
+; This test checks that when inserting one (integer) element into a vector,
+; the vector is not spuriously copied. "vorr dX, dY, dY" is the DPR-to-DPR
+; move we check for.
+
+; CHECK: @f
+; CHECK-NOT: vorr d
+; CHECK: vmov s
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <4 x i32> @f(<4 x i32> %in) {
+ %1 = insertelement <4 x i32> %in, i32 255, i32 3
+ ret <4 x i32> %1
+}
+
+; CHECK: @g
+; CHECK-NOT: vorr d
+; CHECK: vmov.16 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <8 x i16> @g(<8 x i16> %in) {
+ %1 = insertelement <8 x i16> %in, i16 255, i32 7
+ ret <8 x i16> %1
+}
+
+; CHECK: @h
+; CHECK-NOT: vorr d
+; CHECK: vmov.8 d
+; CHECK-NOT: vorr d
+; CHECK: mov pc, lr
+define <16 x i8> @h(<16 x i8> %in) {
+ %1 = insertelement <16 x i8> %in, i8 255, i32 15
+ ret <16 x i8> %1
+}
diff --git a/test/CodeGen/ARM/longMAC.ll b/test/CodeGen/ARM/longMAC.ll
new file mode 100644
index 0000000..e4a00e9
--- /dev/null
+++ b/test/CodeGen/ARM/longMAC.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=arm | FileCheck %s
+; Check that signed and unsigned multiply-accumulate-long instructions are generated.
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+ %conv = zext i32 %a to i64
+ %conv1 = zext i32 %b to i64
+ %mul = mul i64 %conv1, %conv
+ %add = add i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+ %conv = sext i32 %a to i64
+ %conv1 = sext i32 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %add = add nsw i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+ %conv = zext i32 %b to i64
+ %conv1 = zext i32 %a to i64
+ %mul = mul i64 %conv, %conv1
+ %conv2 = zext i32 %c to i64
+ %add = add i64 %mul, %conv2
+ ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+ %conv = sext i32 %b to i64
+ %conv1 = sext i32 %a to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %conv2 = sext i32 %c to i64
+ %add = add nsw i64 %mul, %conv2
+ ret i64 %add
+}
diff --git a/test/CodeGen/ARM/select.ll b/test/CodeGen/ARM/select.ll
index 5575566..62708ed 100644
--- a/test/CodeGen/ARM/select.ll
+++ b/test/CodeGen/ARM/select.ll
@@ -80,7 +80,7 @@ define double @f7(double %a, double %b) {
; CHECK-NEON: adr [[R2:r[0-9]+]], LCPI7_0
; CHECK-NEON-NEXT: cmp r0, [[R3]]
; CHECK-NEON-NEXT: it eq
-; CHECK-NEON-NEXT: addeq.w {{r.*}}, [[R2]]
+; CHECK-NEON-NEXT: addeq{{.*}} [[R2]], #4
; CHECK-NEON-NEXT: ldr
; CHECK-NEON: bx
diff --git a/test/CodeGen/ARM/select_xform.ll b/test/CodeGen/ARM/select_xform.ll
index cfc0e70..7507808 100644
--- a/test/CodeGen/ARM/select_xform.ll
+++ b/test/CodeGen/ARM/select_xform.ll
@@ -9,7 +9,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
; T2: t1:
; T2: mvn r0, #-2147483648
-; T2: addle.w r1, r1
+; T2: addle r1, r0
; T2: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -23,7 +23,7 @@ define i32 @t2(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; ARM: mov r0, r1
; T2: t2:
-; T2: suble.w r1, r1, #10
+; T2: suble r1, #10
; T2: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 10
@@ -37,7 +37,7 @@ define i32 @t3(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
; ARM: mov r0, r3
; T2: t3:
-; T2: andge.w r3, r3, r2
+; T2: andge r3, r2
; T2: mov r0, r3
%cond = icmp slt i32 %a, %b
%z = select i1 %cond, i32 -1, i32 %x
@@ -51,7 +51,7 @@ define i32 @t4(i32 %a, i32 %b, i32 %x, i32 %y) nounwind {
; ARM: mov r0, r3
; T2: t4:
-; T2: orrge.w r3, r3, r2
+; T2: orrge r3, r2
; T2: mov r0, r3
%cond = icmp slt i32 %a, %b
%z = select i1 %cond, i32 0, i32 %x
@@ -81,7 +81,7 @@ define i32 @t6(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; T2: t6:
; T2-NOT: movge
-; T2: eorlt.w r3, r3, r2
+; T2: eorlt r3, r2
%cond = icmp slt i32 %a, %b
%tmp1 = select i1 %cond, i32 %c, i32 0
%tmp2 = xor i32 %tmp1, %d
@@ -200,7 +200,7 @@ entry:
; T2: t13
; T2: cmp r1, #10
-; T2: addgt.w r0, r0, #1
+; T2: addgt r0, #1
%cmp = icmp sgt i32 %a, 10
%conv = zext i1 %cmp to i32
%add = add i32 %conv, %c
@@ -216,7 +216,7 @@ entry:
; T2: t14
; T2: cmp r1, #10
-; T2: subgt.w r0, r0, #1
+; T2: subgt r0, #1
%cmp = icmp sgt i32 %a, 10
%conv = sext i1 %cmp to i32
%add = add i32 %conv, %c
diff --git a/test/CodeGen/ARM/sub-cmp-peephole.ll b/test/CodeGen/ARM/sub-cmp-peephole.ll
index 6fcbdee..2961b94 100644
--- a/test/CodeGen/ARM/sub-cmp-peephole.ll
+++ b/test/CodeGen/ARM/sub-cmp-peephole.ll
@@ -63,3 +63,24 @@ if.then:
if.else:
ret i32 %sub
}
+
+; If the sub/rsb instruction is predicated, we can't use the flags.
+; <rdar://problem/12263428>
+; Test case from MultiSource/Benchmarks/Ptrdist/bc/number.s
+; CHECK: bc_raise
+; CHECK: rsbeq
+; CHECK: cmp
+define i32 @bc_raise() nounwind ssp {
+entry:
+ %val.2.i = select i1 undef, i32 0, i32 undef
+ %sub.i = sub nsw i32 0, %val.2.i
+ %retval.0.i = select i1 undef, i32 %val.2.i, i32 %sub.i
+ %cmp1 = icmp eq i32 %retval.0.i, 0
+ br i1 %cmp1, label %land.lhs.true, label %if.end11
+
+land.lhs.true: ; preds = %num2long.exit
+ ret i32 17
+
+if.end11: ; preds = %num2long.exit
+ ret i32 23
+}
diff --git a/test/CodeGen/ARM/vdup.ll b/test/CodeGen/ARM/vdup.ll
index 05332e4..a8c224b 100644
--- a/test/CodeGen/ARM/vdup.ll
+++ b/test/CodeGen/ARM/vdup.ll
@@ -261,3 +261,37 @@ define void @redundantVdup(<8 x i8>* %ptr) nounwind {
store <8 x i8> %2, <8 x i8>* %ptr, align 8
ret void
}
+
+define <4 x i32> @tdupi(i32 %x, i32 %y) {
+;CHECK: tdupi
+;CHECK: vdup.32
+ %1 = insertelement <4 x i32> undef, i32 %x, i32 0
+ %2 = insertelement <4 x i32> %1, i32 %x, i32 1
+ %3 = insertelement <4 x i32> %2, i32 %x, i32 2
+ %4 = insertelement <4 x i32> %3, i32 %y, i32 3
+ ret <4 x i32> %4
+}
+
+define <4 x float> @tdupf(float %x, float %y) {
+;CHECK: tdupf
+;CHECK: vdup.32
+ %1 = insertelement <4 x float> undef, float %x, i32 0
+ %2 = insertelement <4 x float> %1, float %x, i32 1
+ %3 = insertelement <4 x float> %2, float %x, i32 2
+ %4 = insertelement <4 x float> %3, float %y, i32 3
+ ret <4 x float> %4
+}
+
+; This test checks that when splatting an element from a vector into another,
+; the value isn't moved out to GPRs first.
+define <4 x i32> @tduplane(<4 x i32> %invec) {
+;CHECK: tduplane
+;CHECK-NOT: vmov {{.*}}, d16[1]
+;CHECK: vdup.32 {{.*}}, d16[1]
+ %in = extractelement <4 x i32> %invec, i32 1
+ %1 = insertelement <4 x i32> undef, i32 %in, i32 0
+ %2 = insertelement <4 x i32> %1, i32 %in, i32 1
+ %3 = insertelement <4 x i32> %2, i32 %in, i32 2
+ %4 = insertelement <4 x i32> %3, i32 255, i32 3
+ ret <4 x i32> %4
+}
diff --git a/test/CodeGen/ARM/vector-extend-narrow.ll b/test/CodeGen/ARM/vector-extend-narrow.ll
index 8fd3db2..22af797 100644
--- a/test/CodeGen/ARM/vector-extend-narrow.ll
+++ b/test/CodeGen/ARM/vector-extend-narrow.ll
@@ -62,3 +62,14 @@ define <4 x i8> @i(<4 x i8>* %x) {
%2 = sdiv <4 x i8> zeroinitializer, %1
ret <4 x i8> %2
}
+; CHECK: j:
+define <4 x i32> @j(<4 x i8>* %in) nounwind {
+ ; CHECK: vld1
+ ; CHECK: vmovl.u8
+ ; CHECK: vmovl.u16
+ ; CHECK-NOT: vand
+ %1 = load <4 x i8>* %in, align 4
+ %2 = zext <4 x i8> %1 to <4 x i32>
+ ret <4 x i32> %2
+}
+
diff --git a/test/CodeGen/ARM/vget_lane.ll b/test/CodeGen/ARM/vget_lane.ll
index 1fc885d..2ed65c9 100644
--- a/test/CodeGen/ARM/vget_lane.ll
+++ b/test/CodeGen/ARM/vget_lane.ll
@@ -200,7 +200,7 @@ define <8 x i16> @vsetQ_lane16(<8 x i16>* %A, i16 %B) nounwind {
define <4 x i32> @vsetQ_lane32(<4 x i32>* %A, i32 %B) nounwind {
;CHECK: vsetQ_lane32:
-;CHECK: vmov.32
+;CHECK: vmov s
%tmp1 = load <4 x i32>* %A
%tmp2 = insertelement <4 x i32> %tmp1, i32 %B, i32 1
ret <4 x i32> %tmp2
diff --git a/test/CodeGen/Generic/MachineBranchProb.ll b/test/CodeGen/Generic/MachineBranchProb.ll
new file mode 100644
index 0000000..802ee2c
--- /dev/null
+++ b/test/CodeGen/Generic/MachineBranchProb.ll
@@ -0,0 +1,32 @@
+; RUN: llc < %s -print-machineinstrs=expand-isel-pseudos -o /dev/null 2>&1 | FileCheck %s
+
+; Make sure we have the correct weight attached to each successor.
+define i32 @test2(i32 %x) nounwind uwtable readnone ssp {
+; CHECK: Machine code for function test2:
+entry:
+ %conv = sext i32 %x to i64
+ switch i64 %conv, label %return [
+ i64 0, label %sw.bb
+ i64 1, label %sw.bb
+ i64 4, label %sw.bb
+ i64 5, label %sw.bb1
+ ], !prof !0
+; CHECK: BB#0: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#2(64) BB#4(14)
+; CHECK: BB#4: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(10) BB#5(4)
+; CHECK: BB#5: derived from LLVM BB %entry
+; CHECK: Successors according to CFG: BB#1(4) BB#3(7)
+
+sw.bb:
+ br label %return
+
+sw.bb1:
+ br label %return
+
+return:
+ %retval.0 = phi i32 [ 5, %sw.bb1 ], [ 1, %sw.bb ], [ 0, %entry ]
+ ret i32 %retval.0
+}
+
+!0 = metadata !{metadata !"branch_weights", i32 7, i32 6, i32 4, i32 4, i32 64}
diff --git a/test/CodeGen/Hexagon/args.ll b/test/CodeGen/Hexagon/args.ll
index e9ac8b6..8a6efb6 100644
--- a/test/CodeGen/Hexagon/args.ll
+++ b/test/CodeGen/Hexagon/args.ll
@@ -1,12 +1,12 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
; CHECK: r[[T0:[0-9]+]] = #7
; CHECK: memw(r29 + #0) = r[[T0]]
+; CHECK: r5 = #6
; CHECK: r0 = #1
; CHECK: r1 = #2
; CHECK: r2 = #3
; CHECK: r3 = #4
; CHECK: r4 = #5
-; CHECK: r5 = #6
define void @foo() nounwind {
diff --git a/test/CodeGen/Hexagon/newvaluestore.ll b/test/CodeGen/Hexagon/newvaluestore.ll
index ab69b22..186e393 100644
--- a/test/CodeGen/Hexagon/newvaluestore.ll
+++ b/test/CodeGen/Hexagon/newvaluestore.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-hexagon-misched < %s | FileCheck %s
; Check that we generate new value store packet in V4
@i = global i32 0, align 4
diff --git a/test/CodeGen/Hexagon/remove_lsr.ll b/test/CodeGen/Hexagon/remove_lsr.ll
new file mode 100644
index 0000000..79b5f4a
--- /dev/null
+++ b/test/CodeGen/Hexagon/remove_lsr.ll
@@ -0,0 +1,80 @@
+; Test fix for PR-13709.
+; RUN: llc -march=hexagon -mcpu=hexagonv4 < %s | FileCheck %s
+; CHECK: foo
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+; CHECK-NOT: lsr(r{{[0-9]+}}:{{[0-9]+}}, #32)
+
+; Convert the sequence
+; r17:16 = lsr(r11:10, #32)
+; .. = r16
+; into
+; r17:16 = lsr(r11:10, #32)
+; .. = r11
+; This makes the lsr instruction dead and it gets removed subsequently
+; by a dead code removal pass.
+
+%union.vect64 = type { i64 }
+%union.vect32 = type { i32 }
+
+define void @foo(%union.vect64* nocapture %sss_extracted_bit_rx_data_ptr,
+ %union.vect32* nocapture %s_even, %union.vect32* nocapture %s_odd,
+ i8* nocapture %scr_s_even_code_ptr, i8* nocapture %scr_s_odd_code_ptr)
+ nounwind {
+entry:
+ %scevgep = getelementptr %union.vect64* %sss_extracted_bit_rx_data_ptr, i32 1
+ %scevgep28 = getelementptr %union.vect32* %s_odd, i32 1
+ %scevgep32 = getelementptr %union.vect32* %s_even, i32 1
+ %scevgep36 = getelementptr i8* %scr_s_odd_code_ptr, i32 1
+ %scevgep39 = getelementptr i8* %scr_s_even_code_ptr, i32 1
+ br label %for.body
+
+for.body: ; preds = %for.body, %entry
+ %lsr.iv42 = phi i32 [ %lsr.iv.next, %for.body ], [ 2, %entry ]
+ %lsr.iv40 = phi i8* [ %scevgep41, %for.body ], [ %scevgep39, %entry ]
+ %lsr.iv37 = phi i8* [ %scevgep38, %for.body ], [ %scevgep36, %entry ]
+ %lsr.iv33 = phi %union.vect32* [ %scevgep34, %for.body ], [ %scevgep32, %entry ]
+ %lsr.iv29 = phi %union.vect32* [ %scevgep30, %for.body ], [ %scevgep28, %entry ]
+ %lsr.iv = phi %union.vect64* [ %scevgep26, %for.body ], [ %scevgep, %entry ]
+ %predicate_1.023 = phi i8 [ undef, %entry ], [ %10, %for.body ]
+ %predicate.022 = phi i8 [ undef, %entry ], [ %9, %for.body ]
+ %val.021 = phi i64 [ undef, %entry ], [ %srcval, %for.body ]
+ %lsr.iv3335 = bitcast %union.vect32* %lsr.iv33 to i32*
+ %lsr.iv2931 = bitcast %union.vect32* %lsr.iv29 to i32*
+ %lsr.iv27 = bitcast %union.vect64* %lsr.iv to i64*
+ %0 = tail call i64 @llvm.hexagon.A2.vsubhs(i64 0, i64 %val.021)
+ %conv3 = sext i8 %predicate.022 to i32
+ %1 = trunc i64 %val.021 to i32
+ %2 = trunc i64 %0 to i32
+ %3 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv3, i32 %1, i32 %2)
+ store i32 %3, i32* %lsr.iv3335, align 4, !tbaa !0
+ %conv8 = sext i8 %predicate_1.023 to i32
+ %4 = lshr i64 %val.021, 32
+ %5 = trunc i64 %4 to i32
+ %6 = lshr i64 %0, 32
+ %7 = trunc i64 %6 to i32
+ %8 = tail call i32 @llvm.hexagon.C2.mux(i32 %conv8, i32 %5, i32 %7)
+ store i32 %8, i32* %lsr.iv2931, align 4, !tbaa !0
+ %srcval = load i64* %lsr.iv27, align 8
+ %9 = load i8* %lsr.iv40, align 1, !tbaa !1
+ %10 = load i8* %lsr.iv37, align 1, !tbaa !1
+ %lftr.wideiv = trunc i32 %lsr.iv42 to i8
+ %exitcond = icmp eq i8 %lftr.wideiv, 32
+ %scevgep26 = getelementptr %union.vect64* %lsr.iv, i32 1
+ %scevgep30 = getelementptr %union.vect32* %lsr.iv29, i32 1
+ %scevgep34 = getelementptr %union.vect32* %lsr.iv33, i32 1
+ %scevgep38 = getelementptr i8* %lsr.iv37, i32 1
+ %scevgep41 = getelementptr i8* %lsr.iv40, i32 1
+ %lsr.iv.next = add i32 %lsr.iv42, 1
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body
+ ret void
+}
+
+declare i64 @llvm.hexagon.A2.vsubhs(i64, i64) nounwind readnone
+
+declare i32 @llvm.hexagon.C2.mux(i32, i32, i32) nounwind readnone
+
+!0 = metadata !{metadata !"long", metadata !1}
+!1 = metadata !{metadata !"omnipotent char", metadata !2}
+!2 = metadata !{metadata !"Simple C/C++ TBAA", null}
diff --git a/test/CodeGen/Hexagon/static.ll b/test/CodeGen/Hexagon/static.ll
index 2e4ab63..683a4c2 100644
--- a/test/CodeGen/Hexagon/static.ll
+++ b/test/CodeGen/Hexagon/static.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched < %s | FileCheck %s
+; RUN: llc -march=hexagon -mcpu=hexagonv4 -disable-dfa-sched -disable-hexagon-misched < %s | FileCheck %s
@num = external global i32
@acc = external global i32
diff --git a/test/CodeGen/Mips/brdelayslot.ll b/test/CodeGen/Mips/brdelayslot.ll
index b266ce6..1b2fbc8 100644
--- a/test/CodeGen/Mips/brdelayslot.ll
+++ b/test/CodeGen/Mips/brdelayslot.ll
@@ -1,12 +1,18 @@
-; RUN: llc -march=mipsel -enable-mips-delay-filler < %s | FileCheck %s
+; RUN: llc -march=mipsel -O0 < %s | FileCheck %s -check-prefix=None
+; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=Default
define void @foo1() nounwind {
entry:
-; CHECK: jalr
-; CHECK-NOT: nop
-; CHECK: jr
-; CHECK-NOT: nop
-; CHECK: .end
+; Default: jalr
+; Default-NOT: nop
+; Default: jr
+; Default-NOT: nop
+; Default: .end
+; None: jalr
+; None: nop
+; None: jr
+; None: nop
+; None: .end
tail call void @foo2(i32 3) nounwind
ret void
diff --git a/test/CodeGen/Mips/init-array.ll b/test/CodeGen/Mips/init-array.ll
new file mode 100644
index 0000000..f96ce26
--- /dev/null
+++ b/test/CodeGen/Mips/init-array.ll
@@ -0,0 +1,14 @@
+; RUN: llc -mtriple mipsel-unknown-linux -use-init-array < %s | FileCheck %s
+
+target triple = "mipsel-unknown-linux"
+
+@llvm.global_ctors = appending global [1 x { i32, void ()* }] [{ i32, void ()* } { i32 65535, void ()* @test }]
+; CHECK: .section
+; CHECK: .init_array
+; CHECK-NOT: .ctors
+; CHECK: .4byte test
+
+define internal void @test() section ".text.startup" {
+entry:
+ ret void
+}
diff --git a/test/CodeGen/Mips/longbranch.ll b/test/CodeGen/Mips/longbranch.ll
index 0227b88..873b9f1 100644
--- a/test/CodeGen/Mips/longbranch.ll
+++ b/test/CodeGen/Mips/longbranch.ll
@@ -6,9 +6,15 @@
define void @foo1(i32 %s) nounwind {
entry:
; O32: bal
+; O32: lui $at, 0
+; O32: addiu $at, $at, {{[0-9]+}}
+; N64: lui $at, 0
+; N64: daddiu $at, $at, 0
+; N64: dsll $at, $at, 16
+; N64: daddiu $at, $at, 0
; N64: bal
-; N64: highest
-; N64: higher
+; N64: dsll $at, $at, 16
+; N64: daddiu $at, $at, {{[0-9]+}}
%tobool = icmp eq i32 %s, 0
br i1 %tobool, label %if.end, label %if.then
diff --git a/test/CodeGen/Mips/small-section-reserve-gp.ll b/test/CodeGen/Mips/small-section-reserve-gp.ll
new file mode 100644
index 0000000..03503fb
--- /dev/null
+++ b/test/CodeGen/Mips/small-section-reserve-gp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -mtriple=mipsel-sde-elf -march=mipsel -relocation-model=static < %s \
+; RUN: | FileCheck %s
+
+@i = internal unnamed_addr global i32 0, align 4
+
+define i32 @geti() nounwind readonly {
+entry:
+; CHECK: lw ${{[0-9]+}}, %gp_rel(i)($gp)
+ %0 = load i32* @i, align 4
+ ret i32 %0
+}
+
diff --git a/test/CodeGen/Mips/tls-alias.ll b/test/CodeGen/Mips/tls-alias.ll
index d681091..ce98cc8 100644
--- a/test/CodeGen/Mips/tls-alias.ll
+++ b/test/CodeGen/Mips/tls-alias.ll
@@ -1,4 +1,4 @@
-; RUN: llc -march=mipsel -relocation-model=pic < %s | FileCheck %s
+; RUN: llc -march=mipsel -relocation-model=pic -disable-mips-delay-filler < %s | FileCheck %s
@foo = thread_local global i32 42
@bar = hidden alias i32* @foo
diff --git a/test/CodeGen/Mips/tls.ll b/test/CodeGen/Mips/tls.ll
index a7ddb96..72d30dc 100644
--- a/test/CodeGen/Mips/tls.ll
+++ b/test/CodeGen/Mips/tls.ll
@@ -1,8 +1,10 @@
-; RUN: llc -march=mipsel < %s | FileCheck %s -check-prefix=PIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN: | FileCheck %s -check-prefix=STATIC
-; RUN: llc -march=mipsel -relocation-model=static < %s \
-; RUN: -mips-fix-global-base-reg=false | FileCheck %s -check-prefix=STATICGP
+; RUN: llc -march=mipsel -disable-mips-delay-filler < %s | \
+; RUN: FileCheck %s -check-prefix=PIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler < \
+; RUN: %s | FileCheck %s -check-prefix=STATIC
+; RUN: llc -march=mipsel -relocation-model=static -disable-mips-delay-filler \
+; RUN: -mips-fix-global-base-reg=false < %s | \
+; RUN: FileCheck %s -check-prefix=STATICGP
@t1 = thread_local global i32 0, align 4
diff --git a/test/CodeGen/Mips/uitofp.ll b/test/CodeGen/Mips/uitofp.ll
new file mode 100644
index 0000000..aff70c2
--- /dev/null
+++ b/test/CodeGen/Mips/uitofp.ll
@@ -0,0 +1,12 @@
+; RUN: llc -march=mips -mattr=+single-float < %s
+
+define void @f0() nounwind {
+entry:
+ %b = alloca i32, align 4
+ %a = alloca float, align 4
+ store volatile i32 1, i32* %b, align 4
+ %0 = load volatile i32* %b, align 4
+ %conv = uitofp i32 %0 to float
+ store float %conv, float* %a, align 4
+ ret void
+}
diff --git a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
index 0003a17..b95ac68 100644
--- a/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
+++ b/test/CodeGen/PowerPC/2010-03-09-indirect-call.ll
@@ -9,9 +9,8 @@ target triple = "powerpc-apple-darwin11.0"
define void @foo() nounwind ssp {
entry:
-; Better: mtctr r12
-; CHECK: mr r12, [[REG:r[0-9]+]]
-; CHECK: mtctr [[REG]]
+; CHECK: mtctr r12
+; CHECK: bctrl
%0 = load void (...)** @p, align 4 ; <void (...)*> [#uses=1]
call void (...)* %0() nounwind
br label %return
diff --git a/test/CodeGen/PowerPC/big-endian-formal-args.ll b/test/CodeGen/PowerPC/big-endian-formal-args.ll
index 9a456b6..638059a 100644
--- a/test/CodeGen/PowerPC/big-endian-formal-args.ll
+++ b/test/CodeGen/PowerPC/big-endian-formal-args.ll
@@ -2,10 +2,10 @@
declare void @bar(i64 %x, i64 %y)
-; CHECK: li {{[53]}}, 0
+; CHECK: li 3, 0
; CHECK: li 4, 2
+; CHECK: li 5, 0
; CHECK: li 6, 3
-; CHECK: mr {{[53]}}, {{[53]}}
define void @foo() {
call void @bar(i64 2, i64 3)
diff --git a/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
new file mode 100644
index 0000000..afa1ea8
--- /dev/null
+++ b/test/CodeGen/PowerPC/cr1eq-no-extra-moves.ll
@@ -0,0 +1,26 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [3 x i8] c"%i\00", align 1
+
+define void @test(i32 %count) nounwind {
+entry:
+; CHECK: crxor 6, 6, 6
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+ %cmp2 = icmp sgt i32 %count, 0
+ br i1 %cmp2, label %for.body, label %for.end
+
+for.body: ; preds = %entry, %for.body
+ %i.03 = phi i32 [ %inc, %for.body ], [ 0, %entry ]
+; CHECK: crxor 6, 6, 6
+ %call1 = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([3 x i8]* @.str, i32 0, i32 0), i32 1) nounwind
+ %inc = add nsw i32 %i.03, 1
+ %exitcond = icmp eq i32 %inc, %count
+ br i1 %exitcond, label %for.end, label %for.body
+
+for.end: ; preds = %for.body, %entry
+ ret void
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/PowerPC/fsl-e500mc.ll b/test/CodeGen/PowerPC/fsl-e500mc.ll
new file mode 100644
index 0000000..09b7e41
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e500mc.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e500mc and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e500mc < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-fsl-linux"
+
+%struct.teststruct = type { [12 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+ %0 = bitcast %struct.teststruct* %agg.result to i8*
+ %1 = bitcast %struct.teststruct* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i32(i8* %0, i8* %1, i32 52, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i32(i8* nocapture, i8* nocapture, i32, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/fsl-e5500.ll b/test/CodeGen/PowerPC/fsl-e5500.ll
new file mode 100644
index 0000000..d47d8c8
--- /dev/null
+++ b/test/CodeGen/PowerPC/fsl-e5500.ll
@@ -0,0 +1,22 @@
+;
+; Test support for Freescale e5500 and its higher memcpy inlining thresholds.
+;
+; RUN: llc -mcpu=e5500 < %s 2>&1 | FileCheck %s
+; CHECK-NOT: not a recognized processor for this target
+
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-fsl-linux"
+
+%struct.teststruct = type { [24 x i32], i32 }
+
+define void @copy(%struct.teststruct* noalias nocapture sret %agg.result, %struct.teststruct* nocapture %in) nounwind {
+entry:
+; CHECK: @copy
+; CHECK-NOT: bl memcpy
+ %0 = bitcast %struct.teststruct* %agg.result to i8*
+ %1 = bitcast %struct.teststruct* %in to i8*
+ tail call void @llvm.memcpy.p0i8.p0i8.i64(i8* %0, i8* %1, i64 100, i32 4, i1 false)
+ ret void
+}
+
+declare void @llvm.memcpy.p0i8.p0i8.i64(i8* nocapture, i8* nocapture, i64, i32, i1) nounwind
diff --git a/test/CodeGen/PowerPC/inlineasm-copy.ll b/test/CodeGen/PowerPC/inlineasm-copy.ll
index e1ff82d..59c3388 100644
--- a/test/CodeGen/PowerPC/inlineasm-copy.ll
+++ b/test/CodeGen/PowerPC/inlineasm-copy.ll
@@ -1,5 +1,6 @@
-; RUN: llc < %s -march=ppc32 | not grep mr
+; RUN: llc < %s -march=ppc32 -verify-machineinstrs | FileCheck %s
+; CHECK-NOT: mr
define i32 @test(i32 %Y, i32 %X) {
entry:
%tmp = tail call i32 asm "foo $0", "=r"( ) ; <i32> [#uses=1]
@@ -12,3 +13,9 @@ entry:
ret i32 %tmp1
}
+; CHECK: test3
+define i32 @test3(i32 %Y, i32 %X) {
+entry:
+ %tmp1 = tail call { i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32, i32 } asm sideeffect "foo $0, $1", "=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,=r,0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15,16,17,18,19"( i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y, i32 %X, i32 %Y ) ; <i32> [#uses=1]
+ ret i32 1
+}
diff --git a/test/CodeGen/PowerPC/ppc64-toc.ll b/test/CodeGen/PowerPC/ppc64-toc.ll
new file mode 100644
index 0000000..f1326ba
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-toc.ll
@@ -0,0 +1,67 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+@double_array = global [32 x double] zeroinitializer, align 8
+@number64 = global i64 10, align 8
+@internal_static_var.x = internal unnamed_addr global i64 0, align 8
+
+define i64 @access_int64(i64 %a) nounwind readonly {
+entry:
+; CHECK: access_int64:
+; CHECK-NEXT: .align 3
+; CHECK-NEXT: .quad .L.access_int64
+; CHECK-NEXT: .quad .TOC.@tocbase
+; CHECK-NEXT: .text
+ %0 = load i64* @number64, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+define i64 @internal_static_var(i64 %a) nounwind {
+entry:
+; CHECK: internal_static_var:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %0 = load i64* @internal_static_var.x, align 8
+ %cmp = icmp eq i64 %0, %a
+ %conv1 = zext i1 %cmp to i64
+ ret i64 %conv1
+}
+
+define i32 @access_double(double %a) nounwind readnone {
+entry:
+; CHECK: access_double:
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = fcmp oeq double %a, 2.000000e+00
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+
+define i32 @access_double_array(double %a, i32 %i) nounwind readonly {
+entry:
+; CHECK: access_double_array:
+ %idxprom = sext i32 %i to i64
+ %arrayidx = getelementptr inbounds [32 x double]* @double_array, i64 0, i64 %idxprom
+ %0 = load double* %arrayidx, align 8
+; CHECK: ld {{[0-9]+}}, .LC{{[0-9]+}}@toc(2)
+ %cmp = fcmp oeq double %0, %a
+ %conv = zext i1 %cmp to i32
+ ret i32 %conv
+}
+
+; Check the creation of 4 .tc entries:
+; * int64_t global 'number64'
+; * double constant 2.0
+; * double array 'double_array'
+; * static int64_t 'x' accessed within '@internal_static_var'
+; CHECK: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
+; CHECK-NEXT: .LC{{[0-9]+}}:
+; CHECK-NEXT: .tc {{[\._a-zA-Z0-9]+}}[TC],{{[\._a-zA-Z0-9]+}}
diff --git a/test/CodeGen/PowerPC/ppc64-zext.ll b/test/CodeGen/PowerPC/ppc64-zext.ll
new file mode 100644
index 0000000..eb55445
--- /dev/null
+++ b/test/CodeGen/PowerPC/ppc64-zext.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux"
+
+define i64 @fun(i32 %arg32) nounwind {
+entry:
+; CHECK: rldicl {{[0-9]+}}, {{[0-9]+}}, 0, 32
+ %o = zext i32 %arg32 to i64
+ ret i64 %o
+}
+
diff --git a/test/CodeGen/PowerPC/pr13641.ll b/test/CodeGen/PowerPC/pr13641.ll
new file mode 100644
index 0000000..c4d3f3a
--- /dev/null
+++ b/test/CodeGen/PowerPC/pr13641.ll
@@ -0,0 +1,11 @@
+; RUN: llc < %s | FileCheck %s
+target datalayout = "E-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32:64"
+target triple = "powerpc64-unknown-linux-gnu"
+
+define void @foo() nounwind {
+ ret void
+}
+
+; CHECK: blr
+; CHECK-NEXT: .long 0
+; CHECK-NEXT: .quad 0
diff --git a/test/CodeGen/PowerPC/remat-imm.ll b/test/CodeGen/PowerPC/remat-imm.ll
new file mode 100644
index 0000000..520921f
--- /dev/null
+++ b/test/CodeGen/PowerPC/remat-imm.ll
@@ -0,0 +1,16 @@
+; RUN: llc < %s | FileCheck %s
+; ModuleID = 'test.c'
+target datalayout = "E-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v128:128:128-n32"
+target triple = "powerpc-unknown-linux"
+
+@.str = private unnamed_addr constant [6 x i8] c"%d,%d\00", align 1
+
+define i32 @main() nounwind {
+entry:
+; CHECK: li 4, 128
+; CHECK-NOT: mr 4, {{.*}}
+ %call = tail call i32 (i8*, ...)* @printf(i8* getelementptr inbounds ([6 x i8]* @.str, i32 0, i32 0), i32 128, i32 128) nounwind
+ ret i32 0
+}
+
+declare i32 @printf(i8* nocapture, ...) nounwind
diff --git a/test/CodeGen/Thumb/thumb_jump24_fixup.ll b/test/CodeGen/Thumb/thumb_jump24_fixup.ll
new file mode 100644
index 0000000..e6a6b25
--- /dev/null
+++ b/test/CodeGen/Thumb/thumb_jump24_fixup.ll
@@ -0,0 +1,23 @@
+; RUN: llc -mtriple thumbv7-none-linux-gnueabi -mcpu=cortex-a8 -march=thumb -mattr=thumb2 -filetype=obj -o - < %s | llvm-objdump -r - | FileCheck %s
+
+target datalayout = "e-p:32:32:32-i1:8:32-i8:8:32-i16:16:32-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:64:128-a0:0:32-n32-S64"
+target triple = "thumbv7-none-linux-gnueabi"
+
+define i32 @test_fixup_t2_uncondbranch() {
+b0:
+ invoke void @__cxa_throw(i8* null, i8* null, i8* null) noreturn
+ to label %unreachable unwind label %lpad
+
+; CHECK: {{[0-9]+}} R_ARM_THM_JUMP24 __cxa_throw
+
+lpad:
+ %0 = landingpad { i8*, i32 } personality i8* bitcast (i32 (...)* @__gxx_personality_v0 to i8*) cleanup
+ ret i32 0
+
+unreachable:
+ unreachable
+}
+
+declare i32 @__gxx_personality_v0(...)
+
+declare void @__cxa_throw(i8*, i8*, i8*)
diff --git a/test/CodeGen/Thumb2/longMACt.ll b/test/CodeGen/Thumb2/longMACt.ll
new file mode 100644
index 0000000..beefd60
--- /dev/null
+++ b/test/CodeGen/Thumb2/longMACt.ll
@@ -0,0 +1,44 @@
+; RUN: llc < %s -march=thumb -mattr=+thumb2 | FileCheck %s
+; Check that signed and unsigned multiply-accumulate-long instructions are generated.
+
+define i64 @MACLongTest1(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest1:
+;CHECK: umlal
+ %conv = zext i32 %a to i64
+ %conv1 = zext i32 %b to i64
+ %mul = mul i64 %conv1, %conv
+ %add = add i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest2(i32 %a, i32 %b, i64 %c) {
+;CHECK: MACLongTest2:
+;CHECK: smlal
+ %conv = sext i32 %a to i64
+ %conv1 = sext i32 %b to i64
+ %mul = mul nsw i64 %conv1, %conv
+ %add = add nsw i64 %mul, %c
+ ret i64 %add
+}
+
+define i64 @MACLongTest3(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest3:
+;CHECK: umlal
+ %conv = zext i32 %b to i64
+ %conv1 = zext i32 %a to i64
+ %mul = mul i64 %conv, %conv1
+ %conv2 = zext i32 %c to i64
+ %add = add i64 %mul, %conv2
+ ret i64 %add
+}
+
+define i64 @MACLongTest4(i32 %a, i32 %b, i32 %c) {
+;CHECK: MACLongTest4:
+;CHECK: smlal
+ %conv = sext i32 %b to i64
+ %conv1 = sext i32 %a to i64
+ %mul = mul nsw i64 %conv, %conv1
+ %conv2 = sext i32 %c to i64
+ %add = add nsw i64 %mul, %conv2
+ ret i64 %add
+}
diff --git a/test/CodeGen/Thumb2/thumb2-select_xform.ll b/test/CodeGen/Thumb2/thumb2-select_xform.ll
index ead198f..ed4d26d 100644
--- a/test/CodeGen/Thumb2/thumb2-select_xform.ll
+++ b/test/CodeGen/Thumb2/thumb2-select_xform.ll
@@ -5,7 +5,7 @@ define i32 @t1(i32 %a, i32 %b, i32 %c) nounwind {
; CHECK: mvn r0, #-2147483648
; CHECK: cmp r2, #10
; CHECK: it le
-; CHECK: addle.w r1, r1, r0
+; CHECK: addle r1, r0
; CHECK: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 2147483647
@@ -30,7 +30,7 @@ define i32 @t3(i32 %a, i32 %b, i32 %c, i32 %d) nounwind {
; CHECK: t3
; CHECK: cmp r2, #10
; CHECK: it le
-; CHECK: suble.w r1, r1, #10
+; CHECK: suble r1, #10
; CHECK: mov r0, r1
%tmp1 = icmp sgt i32 %c, 10
%tmp2 = select i1 %tmp1, i32 0, i32 10
diff --git a/test/CodeGen/X86/2012-04-26-sdglue.ll b/test/CodeGen/X86/2012-04-26-sdglue.ll
index 9a66b67..0465952 100644
--- a/test/CodeGen/X86/2012-04-26-sdglue.ll
+++ b/test/CodeGen/X86/2012-04-26-sdglue.ll
@@ -5,7 +5,7 @@
; It's hard to test for the ISEL condition because CodeGen optimizes
; away the bugpointed code. Just ensure the basics are still there.
;CHECK: func:
-;CHECK: vpxor
+;CHECK: vxorps
;CHECK: vinsertf128
;CHECK: vpshufd
;CHECK: vpshufd
diff --git a/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
new file mode 100644
index 0000000..6ebbb2e
--- /dev/null
+++ b/test/CodeGen/X86/2012-08-28-UnsafeMathCrash.ll
@@ -0,0 +1,20 @@
+; RUN: llc < %s -enable-unsafe-fp-math
+; <rdar://problem/12180135>
+target datalayout = "e-p:32:32:32-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:32:64-f32:32:32-f64:32:64-v64:64:64-v128:128:128-a0:0:64-f80:128:128-n8:16:32-S128"
+target triple = "i386-apple-macosx10.8.0"
+
+define i32 @foo(float %mean) nounwind readnone ssp align 2 {
+entry:
+ %cmp = fcmp olt float %mean, -3.000000e+00
+ %f.0 = select i1 %cmp, float -3.000000e+00, float %mean
+ %cmp2 = fcmp ult float %f.0, 3.000000e+00
+ %f.1 = select i1 %cmp2, float %f.0, float 0x4007EB8520000000
+ %add = fadd float %f.1, 3.000000e+00
+ %div = fdiv float %add, 2.343750e-02
+ %0 = fpext float %div to double
+ %conv = select i1 undef, double 2.550000e+02, double %0
+ %add8 = fadd double %conv, 5.000000e-01
+ %conv9 = fptosi double %add8 to i32
+ %.conv9 = select i1 undef, i32 255, i32 %conv9
+ ret i32 %.conv9
+}
diff --git a/test/CodeGen/X86/StackColoring.ll b/test/CodeGen/X86/StackColoring.ll
new file mode 100644
index 0000000..0f36ce2
--- /dev/null
+++ b/test/CodeGen/X86/StackColoring.ll
@@ -0,0 +1,362 @@
+; RUN: llc -mcpu=corei7 -no-stack-coloring=false < %s | FileCheck %s --check-prefix=YESCOLOR
+; RUN: llc -mcpu=corei7 -no-stack-coloring=true < %s | FileCheck %s --check-prefix=NOCOLOR
+
+target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128"
+target triple = "x86_64-apple-macosx10.8.0"
+
+;YESCOLOR: subq $136, %rsp
+;NOCOLOR: subq $264, %rsp
+
+
+define i32 @myCall_w2(i32 %in) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_no_merge(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 %t7
+bb3:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+define i32 @myCall2_w2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+;YESCOLOR: subq $208, %rsp
+;NOCOLOR: subq $400, %rsp
+
+
+
+
+define i32 @myCall_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+}
+
+;YESCOLOR: subq $112, %rsp
+;NOCOLOR: subq $400, %rsp
+
+define i32 @myCall2_w4(i32 %in) {
+entry:
+ %a1 = alloca [14 x i8*], align 8
+ %a2 = alloca [13 x i8*], align 8
+ %a3 = alloca [12 x i8*], align 8
+ %a4 = alloca [11 x i8*], align 8
+ %b1 = bitcast [14 x i8*]* %a1 to i8*
+ %b2 = bitcast [13 x i8*]* %a2 to i8*
+ %b3 = bitcast [12 x i8*]* %a3 to i8*
+ %b4 = bitcast [11 x i8*]* %a4 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b1)
+ %t1 = call i32 @foo(i32 %in, i8* %b1)
+ %t2 = call i32 @foo(i32 %in, i8* %b1)
+ call void @llvm.lifetime.end(i64 -1, i8* %b1)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t9 = call i32 @foo(i32 %in, i8* %b2)
+ %t8 = call i32 @foo(i32 %in, i8* %b2)
+ call void @llvm.lifetime.end(i64 -1, i8* %b2)
+ call void @llvm.lifetime.start(i64 -1, i8* %b3)
+ %t3 = call i32 @foo(i32 %in, i8* %b3)
+ %t4 = call i32 @foo(i32 %in, i8* %b3)
+ call void @llvm.lifetime.end(i64 -1, i8* %b3)
+ br i1 undef, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b4)
+ %t11 = call i32 @foo(i32 %in, i8* %b4)
+ call void @llvm.lifetime.end(i64 -1, i8* %b4)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+
+
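+; %b2's lifetime is started but never ended, so it is treated as live to
+; the end of the function; that range still does not overlap %b's.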
+define i32 @myCall2_noend(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
+define i32 @myCall2_noend2(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: subq $144, %rsp
+;NOCOLOR: subq $272, %rsp
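+; %b has no lifetime.start, so it is assumed live from the entry up to
+; its lifetime.end; that range still ends before %b2's begins.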
+define i32 @myCall2_nostart(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+; Adapted from the test in Transforms/Inline/array_merge.ll.
+;YESCOLOR: subq $816, %rsp
+;NOCOLOR: subq $1616, %rsp
+define void @array_merge() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
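+; The lifetime markers are applied through a phi of two slots, so the
+; pass has to be conservative here and must not merge the slots.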
+define i32 @func_phi_lifetime(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ br i1 %d, label %bb0, label %bb1
+
+bb0:
+ %I1 = bitcast [17 x i8*]* %a to i8*
+ br label %bb2
+
+bb1:
+ %I2 = bitcast [16 x i8*]* %a2 to i8*
+ br label %bb2
+
+bb2:
+ %split = phi i8* [ %I1, %bb0 ], [ %I2, %bb1 ]
+ call void @llvm.lifetime.start(i64 -1, i8* %split)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ call void @llvm.lifetime.end(i64 -1, i8* %split)
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+
+;YESCOLOR: multi_region_bb
+;NOCOLOR: multi_region_bb
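+; %0 receives a second lifetime.start while other slots are live; the
+; checks only require that the function still compiles.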
+define void @multi_region_bb() nounwind ssp {
+entry:
+ %A.i1 = alloca [100 x i32], align 4
+ %B.i2 = alloca [100 x i32], align 4
+ %A.i = alloca [100 x i32], align 4
+ %B.i = alloca [100 x i32], align 4
+ %0 = bitcast [100 x i32]* %A.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #1
+ %1 = bitcast [100 x i32]* %B.i to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %1) nounwind
+ call void @bar([100 x i32]* %A.i, [100 x i32]* %B.i) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %1) nounwind
+ %2 = bitcast [100 x i32]* %A.i1 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %2) nounwind
+ %3 = bitcast [100 x i32]* %B.i2 to i8*
+ call void @llvm.lifetime.start(i64 -1, i8* %3) nounwind
+ call void @llvm.lifetime.start(i64 -1, i8* %0) nounwind ; <---- start #2
+ call void @bar([100 x i32]* %A.i1, [100 x i32]* %B.i2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %2) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %0) nounwind
+ call void @llvm.lifetime.end(i64 -1, i8* %3) nounwind
+ ret void
+}
+
+
+;YESCOLOR: subq $272, %rsp
+;NOCOLOR: subq $272, %rsp
+
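+; %b's lifetime.end precedes its lifetime.start, so the slot must be
+; handled conservatively and nothing is merged.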
+define i32 @myCall_end_before_begin(i32 %in, i1 %d) {
+entry:
+ %a = alloca [17 x i8*], align 8
+ %a2 = alloca [16 x i8*], align 8
+ %b = bitcast [17 x i8*]* %a to i8*
+ %b2 = bitcast [16 x i8*]* %a2 to i8*
+ %t1 = call i32 @foo(i32 %in, i8* %b)
+ %t2 = call i32 @foo(i32 %in, i8* %b)
+ call void @llvm.lifetime.end(i64 -1, i8* %b)
+ call void @llvm.lifetime.start(i64 -1, i8* %b)
+ br i1 %d, label %bb2, label %bb3
+bb2:
+ call void @llvm.lifetime.start(i64 -1, i8* %b2)
+ %t3 = call i32 @foo(i32 %in, i8* %b2)
+ %t4 = call i32 @foo(i32 %in, i8* %b2)
+ %t5 = add i32 %t1, %t2
+ %t6 = add i32 %t3, %t4
+ %t7 = add i32 %t5, %t6
+ ret i32 %t7
+bb3:
+ ret i32 0
+}
+
+declare void @bar([100 x i32]* , [100 x i32]*) nounwind
+
+declare void @llvm.lifetime.start(i64, i8* nocapture) nounwind
+
+declare void @llvm.lifetime.end(i64, i8* nocapture) nounwind
+
+declare i32 @foo(i32, i8*)
+
diff --git a/test/CodeGen/X86/atom-bypass-slow-division.ll b/test/CodeGen/X86/atom-bypass-slow-division.ll
new file mode 100644
index 0000000..e7c9605
--- /dev/null
+++ b/test/CodeGen/X86/atom-bypass-slow-division.ll
@@ -0,0 +1,112 @@
+; RUN: llc < %s -mcpu=atom -mtriple=i686-linux | FileCheck %s
+
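+; Atom bypasses slow 32-bit divisions when both operands fit in a byte:
+; the operands are OR'ed together and tested against -256 (0xFFFFFF00)
+; to select between the 8-bit divb and the full idivl at run time.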
+define i32 @test_get_quotient(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = sdiv i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: ret
+; CHECK: divb
+; CHECK: ret
+ %result = srem i32 %a, %b
+ ret i32 %result
+}
+
+define i32 @test_get_quotient_and_remainder(i32 %a, i32 %b) nounwind {
+; CHECK: test_get_quotient_and_remainder
+; CHECK: orl %ecx, %edx
+; CHECK-NEXT: testl $-256, %edx
+; CHECK-NEXT: je
+; CHECK: idivl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+; CHECK-NOT: idivl
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, %b
+ %resultrem = srem i32 %a, %b
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_and_idiv(i32 %a, i32 %b) nounwind {
+; CHECK: test_use_div_and_idiv
+; CHECK: idivl
+; CHECK: divb
+; CHECK: divl
+; CHECK: divb
+; CHECK: addl
+; CHECK: ret
+ %resultidiv = sdiv i32 %a, %b
+ %resultdiv = udiv i32 %a, %b
+ %result = add i32 %resultidiv, %resultdiv
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_imm() nounwind {
+; CHECK: test_use_div_imm_imm
+; CHECK: movl $64
+ %resultdiv = sdiv i32 256, 4
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_div_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_div_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_rem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultrem = srem i32 %a, 33
+ ret i32 %resultrem
+}
+
+define i32 @test_use_divrem_reg_imm(i32 %a) nounwind {
+; CHECK: test_use_divrem_reg_imm
+; CHECK-NOT: test
+; CHECK-NOT: idiv
+; CHECK-NOT: divb
+ %resultdiv = sdiv i32 %a, 33
+ %resultrem = srem i32 %a, 33
+ %result = add i32 %resultdiv, %resultrem
+ ret i32 %result
+}
+
+define i32 @test_use_div_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_div_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultdiv = sdiv i32 4, %a
+ ret i32 %resultdiv
+}
+
+define i32 @test_use_rem_imm_reg(i32 %a) nounwind {
+; CHECK: test_use_rem_imm_reg
+; CHECK: test
+; CHECK: idiv
+; CHECK: divb
+ %resultrem = srem i32 4, %a
+ ret i32 %resultrem
+}
diff --git a/test/CodeGen/X86/avx-basic.ll b/test/CodeGen/X86/avx-basic.ll
index 8ad0fa8..95854c7 100644
--- a/test/CodeGen/X86/avx-basic.ll
+++ b/test/CodeGen/X86/avx-basic.ll
@@ -109,8 +109,8 @@ allocas:
; rdar://10566486
; CHECK: fneg
; CHECK: vxorps
-define <16 x float> @fneg(<16 x float> addrspace(1)* nocapture %out) nounwind {
- %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, <float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00, float 1.000000e+00>
+define <16 x float> @fneg(<16 x float> %a) nounwind {
+ %1 = fsub <16 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %a
ret <16 x float> %1
}
diff --git a/test/CodeGen/X86/avx-shuffle.ll b/test/CodeGen/X86/avx-shuffle.ll
index 9b41709..ec11654 100644
--- a/test/CodeGen/X86/avx-shuffle.ll
+++ b/test/CodeGen/X86/avx-shuffle.ll
@@ -229,9 +229,8 @@ define <8 x float> @test17(<4 x float> %y) {
}
; CHECK: test18
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovshdup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15>
@@ -239,9 +238,8 @@ define <8 x float> @test18(<8 x float> %A, <8 x float>%B) nounwind {
}
; CHECK: test19
-; CHECK: vshufps
-; CHECK: vshufps
-; CHECK: vunpcklps
+; CHECK: vmovsldup
+; CHECK: vblendps
; CHECK: ret
define <8 x float> @test19(<8 x float> %A, <8 x float>%B) nounwind {
%S = shufflevector <8 x float> %A, <8 x float> %B, <8 x i32> <i32 0, i32 8, i32 2, i32 10, i32 4, i32 12, i32 6, i32 14>
diff --git a/test/CodeGen/X86/avx-vextractf128.ll b/test/CodeGen/X86/avx-vextractf128.ll
index fe0f6ca..ff56a45 100644
--- a/test/CodeGen/X86/avx-vextractf128.ll
+++ b/test/CodeGen/X86/avx-vextractf128.ll
@@ -19,12 +19,12 @@ entry:
}
; CHECK: @t0
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t0(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 1)
%1 = bitcast float* %addr to <4 x float>*
store <4 x float> %0, <4 x float>* %1, align 16
ret void
@@ -32,27 +32,13 @@ entry:
declare <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float>, i8) nounwind readnone
-; CHECK: @t1
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t1(float* %addr, <8 x float> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
- %1 = bitcast float* %addr to i8*
- tail call void @llvm.x86.sse.storeu.ps(i8* %1, <4 x float> %0)
- ret void
-}
-
-declare void @llvm.x86.sse.storeu.ps(i8*, <4 x float>) nounwind
-
; CHECK: @t2
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t2(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 1)
%1 = bitcast double* %addr to <2 x double>*
store <2 x double> %0, <2 x double>* %1, align 16
ret void
@@ -60,28 +46,14 @@ entry:
declare <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double>, i8) nounwind readnone
-; CHECK: @t3
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovups %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t3(double* %addr, <4 x double> %a) nounwind uwtable ssp {
-entry:
- %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
- %1 = bitcast double* %addr to i8*
- tail call void @llvm.x86.sse2.storeu.pd(i8* %1, <2 x double> %0)
- ret void
-}
-
-declare void @llvm.x86.sse2.storeu.pd(i8*, <2 x double>) nounwind
-
; CHECK: @t4
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
+; CHECK-NOT: vextractf128 $1, %ymm0, %xmm0
; CHECK-NOT: vmovaps %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
+; CHECK: vextractf128 $1, %ymm0, (%rdi)
define void @t4(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
- %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 1)
%2 = bitcast <4 x i32> %1 to <2 x i64>
store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
@@ -90,17 +62,43 @@ entry:
declare <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32>, i8) nounwind readnone
; CHECK: @t5
-; CHECK-NOT: vextractf128 $0, %ymm0, %xmm0
-; CHECK-NOT: vmovdqu %xmm0, (%rdi)
-; CHECK: vextractf128 $0, %ymm0, (%rdi)
-define void @t5(<2 x i64>* %addr, <4 x i64> %a) nounwind uwtable ssp {
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t5(float* nocapture %addr, <8 x float> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <4 x float> @llvm.x86.avx.vextractf128.ps.256(<8 x float> %a, i8 0)
+ %1 = bitcast float* %addr to <4 x float>*
+ store <4 x float> %0, <4 x float>* %1, align 16
+ ret void
+}
+
+; CHECK: @t6
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t6(double* nocapture %addr, <4 x double> %a) nounwind uwtable ssp {
+entry:
+ %0 = tail call <2 x double> @llvm.x86.avx.vextractf128.pd.256(<4 x double> %a, i8 0)
+ %1 = bitcast double* %addr to <2 x double>*
+ store <2 x double> %0, <2 x double>* %1, align 16
+ ret void
+}
+
+; CHECK: @t7
+; CHECK: vmovaps %xmm0, (%rdi)
+define void @t7(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
entry:
%0 = bitcast <4 x i64> %a to <8 x i32>
%1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
- %2 = bitcast <2 x i64>* %addr to i8*
- %3 = bitcast <4 x i32> %1 to <16 x i8>
- tail call void @llvm.x86.sse2.storeu.dq(i8* %2, <16 x i8> %3)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 16
ret void
}
-declare void @llvm.x86.sse2.storeu.dq(i8*, <16 x i8>) nounwind
+; CHECK: @t8
+; CHECK: vmovups %xmm0, (%rdi)
+define void @t8(<2 x i64>* nocapture %addr, <4 x i64> %a) nounwind uwtable ssp {
+entry:
+ %0 = bitcast <4 x i64> %a to <8 x i32>
+ %1 = tail call <4 x i32> @llvm.x86.avx.vextractf128.si.256(<8 x i32> %0, i8 0)
+ %2 = bitcast <4 x i32> %1 to <2 x i64>
+ store <2 x i64> %2, <2 x i64>* %addr, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/avx2-shuffle.ll b/test/CodeGen/X86/avx2-shuffle.ll
index c5899fa..267a806 100644
--- a/test/CodeGen/X86/avx2-shuffle.ll
+++ b/test/CodeGen/X86/avx2-shuffle.ll
@@ -26,3 +26,37 @@ entry:
%shuffle.i = shufflevector <16 x i16> %src1, <16 x i16> %src1, <16 x i32> <i32 3, i32 undef, i32 1, i32 0, i32 4, i32 5, i32 6, i32 7, i32 11, i32 10, i32 9, i32 8, i32 12, i32 13, i32 14, i32 15>
ret <16 x i16> %shuffle.i
}
+
+; CHECK: vpshufb_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> undef, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 19, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+; CHECK: vpshufb1_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb1_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> %a, <32 x i8> zeroinitializer, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
+
+
+; CHECK: vpshufb2_test
+; CHECK: vpshufb {{.*\(%r.*}}, %ymm
+; CHECK: ret
+define <32 x i8> @vpshufb2_test(<32 x i8> %a) nounwind {
+ %S = shufflevector <32 x i8> zeroinitializer, <32 x i8> %a, <32 x i32> <i32 1, i32 9, i32 3, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 1, i32 9, i32 36, i32 11, i32 5, i32 13, i32 7, i32 15,
+ i32 18, i32 49, i32 30, i32 16, i32 25, i32 23, i32 17, i32 25,
+ i32 20, i32 19, i32 31, i32 17, i32 23, i32 undef, i32 29, i32 18>
+ ret <32 x i8>%S
+}
diff --git a/test/CodeGen/X86/bool-simplify.ll b/test/CodeGen/X86/bool-simplify.ll
index 0cb9fd9..09eb5d1 100644
--- a/test/CodeGen/X86/bool-simplify.ll
+++ b/test/CodeGen/X86/bool-simplify.ll
@@ -1,4 +1,4 @@
-; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86-64 -mattr=+sse41,-avx,+rdrand | FileCheck %s
define i32 @foo(<2 x i64> %c, i32 %a, i32 %b) {
%t1 = call i32 @llvm.x86.sse41.ptestz(<2 x i64> %c, <2 x i64> %c)
@@ -39,4 +39,20 @@ define i32 @bax(<2 x i64> %c) {
; CHECK: ret
}
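+
+; rdrand reports success in its second result (materialized from the
+; carry flag), so the select below should fold into a single cmov on the
+; flags with no extra test.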
+define i32 @rnd(i32 %arg) nounwind uwtable {
+ %1 = tail call { i32, i32 } @llvm.x86.rdrand.32() nounwind
+ %2 = extractvalue { i32, i32 } %1, 0
+ %3 = extractvalue { i32, i32 } %1, 1
+ %4 = icmp eq i32 %3, 0
+ %5 = select i1 %4, i32 0, i32 %arg
+ %6 = add i32 %5, %2
+ ret i32 %6
+; CHECK: rnd
+; CHECK: rdrand
+; CHECK: cmov
+; CHECK-NOT: cmov
+; CHECK: ret
+}
+
declare i32 @llvm.x86.sse41.ptestz(<2 x i64>, <2 x i64>) nounwind readnone
+declare { i32, i32 } @llvm.x86.rdrand.32() nounwind
diff --git a/test/CodeGen/X86/fast-isel-x86-64.ll b/test/CodeGen/X86/fast-isel-x86-64.ll
index d8f4663..85a70aa 100644
--- a/test/CodeGen/X86/fast-isel-x86-64.ll
+++ b/test/CodeGen/X86/fast-isel-x86-64.ll
@@ -1,4 +1,5 @@
-; RUN: llc < %s -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=-avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s
+; RUN: llc < %s -mattr=+avx -fast-isel -O0 -regalloc=fast -asm-verbose=0 -fast-isel-abort | FileCheck %s --check-prefix=AVX
target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64"
target triple = "x86_64-apple-darwin10.0.0"
@@ -197,6 +198,11 @@ block2:
; CHECK: cvtsi2sdq {{.*}} %xmm0
; CHECK: movb $1, %al
; CHECK: callq _test16callee
+
+; AVX: movabsq $1
+; AVX: vmovsd LCP{{.*}}_{{.*}}(%rip), %xmm0
+; AVX: movb $1, %al
+; AVX: callq _test16callee
call void (...)* @test16callee(double 1.000000e+00)
ret void
}
diff --git a/test/CodeGen/X86/fma.ll b/test/CodeGen/X86/fma.ll
index b0c1d0a..bd3514c 100644
--- a/test/CodeGen/X86/fma.ll
+++ b/test/CodeGen/X86/fma.ll
@@ -1,11 +1,13 @@
-; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=i386-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma | FileCheck %s --check-prefix=CHECK-FMA-INST
-; RUN: llc < %s -mtriple=x86_64-apple-darwin10 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=i386-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=+fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -mtriple=x86_64-apple-darwin10 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma4 | FileCheck %s --check-prefix=CHECK-FMA-INST
+; RUN: llc < %s -march=x86 -mcpu=bdver2 -mattr=-fma,-fma4 | FileCheck %s --check-prefix=CHECK-FMA-CALL
; CHECK: test_f32
; CHECK-FMA-INST: vfmadd213ss
-; CHECK-FMA-CALL: _fmaf
+; CHECK-FMA-CALL: fmaf
define float @test_f32(float %a, float %b, float %c) nounwind readnone ssp {
entry:
@@ -15,7 +17,7 @@ entry:
; CHECK: test_f64
; CHECK-FMA-INST: vfmadd213sd
-; CHECK-FMA-CALL: _fma
+; CHECK-FMA-CALL: fma
define double @test_f64(double %a, double %b, double %c) nounwind readnone ssp {
entry:
@@ -24,7 +26,7 @@ entry:
}
; CHECK: test_f80
-; CHECK: _fmal
+; CHECK: fmal
define x86_fp80 @test_f80(x86_fp80 %a, x86_fp80 %b, x86_fp80 %c) nounwind readnone ssp {
entry:
diff --git a/test/CodeGen/X86/fma3-intrinsics.ll b/test/CodeGen/X86/fma3-intrinsics.ll
index 90529e0..e3910a6 100755
--- a/test/CodeGen/X86/fma3-intrinsics.ll
+++ b/test/CodeGen/X86/fma3-intrinsics.ll
@@ -1,4 +1,6 @@
-; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 -mattr=avx2,+fma | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mcpu=core-avx2 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-pc-win32 -mattr=+fma,+fma4 | FileCheck %s
+; RUN: llc < %s -mcpu=bdver2 -mtriple=x86_64-pc-win32 -mattr=-fma4 | FileCheck %s
define <4 x float> @test_x86_fmadd_ss(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
; CHECK: fmadd213ss %xmm
diff --git a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
index fd414b3..2fe1ecd 100644
--- a/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
+++ b/test/CodeGen/X86/fma4-intrinsics-x86_64.ll
@@ -1,4 +1,5 @@
; RUN: llc < %s -mtriple=x86_64-unknown-unknown -march=x86-64 -mattr=+avx,+fma4 | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=+avx,-fma | FileCheck %s
; VFMADD
define < 4 x float > @test_x86_fma_vfmadd_ss(< 4 x float > %a0, < 4 x float > %a1, < 4 x float > %a2) {
diff --git a/test/CodeGen/X86/fma_patterns.ll b/test/CodeGen/X86/fma_patterns.ll
index 5d97a87..6d98d59 100644
--- a/test/CodeGen/X86/fma_patterns.ll
+++ b/test/CodeGen/X86/fma_patterns.ll
@@ -1,8 +1,13 @@
; RUN: llc < %s -mtriple=x86_64-apple-darwin -mcpu=core-avx2 -mattr=avx2,+fma -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver2 -mattr=-fma4 -fp-contract=fast | FileCheck %s
+; RUN: llc < %s -mtriple=x86_64-unknown-unknown -mcpu=bdver1 -fp-contract=fast | FileCheck %s --check-prefix=CHECK_FMA4
; CHECK: test_x86_fmadd_ps
-; CHECK: vfmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: vfmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps
+; CHECK_FMA4: vfmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fadd <4 x float> %x, %a2
@@ -10,8 +15,11 @@ define <4 x float> @test_x86_fmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fmsub_ps
-; CHECK: fmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps
+; CHECK_FMA4: vfmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %x, %a2
@@ -19,8 +27,11 @@ define <4 x float> @test_x86_fmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x flo
}
; CHECK: test_x86_fnmadd_ps
-; CHECK: fnmadd213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmadd213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps
+; CHECK_FMA4: vfnmaddps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%res = fsub <4 x float> %a2, %x
@@ -28,8 +39,11 @@ define <4 x float> @test_x86_fnmadd_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fnmsub_ps
-; CHECK: fnmsub213ps %xmm2, %xmm0, %xmm1
+; CHECK: fnmsub213ps %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ps
+; CHECK_FMA4: fnmsubps %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x float> %a2) {
%x = fmul <4 x float> %a0, %a1
%y = fsub <4 x float> <float -0.000000e+00, float -0.000000e+00, float -0.000000e+00, float -0.000000e+00>, %x
@@ -38,8 +52,11 @@ define <4 x float> @test_x86_fnmsub_ps(<4 x float> %a0, <4 x float> %a1, <4 x fl
}
; CHECK: test_x86_fmadd_ps_y
-; CHECK: vfmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_y
+; CHECK_FMA4: vfmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fadd <8 x float> %x, %a2
@@ -47,8 +64,11 @@ define <8 x float> @test_x86_fmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fmsub_ps_y
-; CHECK: vfmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_y
+; CHECK_FMA4: vfmsubps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %x, %a2
@@ -56,8 +76,11 @@ define <8 x float> @test_x86_fmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x f
}
; CHECK: test_x86_fnmadd_ps_y
-; CHECK: vfnmadd213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmadd213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ps_y
+; CHECK_FMA4: vfnmaddps %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
%res = fsub <8 x float> %a2, %x
@@ -65,7 +88,7 @@ define <8 x float> @test_x86_fnmadd_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fnmsub_ps_y
-; CHECK: vfnmsub213ps %ymm2, %ymm0, %ymm1
+; CHECK: vfnmsub213ps %ymm2, %ymm1, %ymm0
; CHECK: ret
define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x float> %a2) {
%x = fmul <8 x float> %a0, %a1
@@ -75,8 +98,11 @@ define <8 x float> @test_x86_fnmsub_ps_y(<8 x float> %a0, <8 x float> %a1, <8 x
}
; CHECK: test_x86_fmadd_pd_y
-; CHECK: vfmadd213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmadd213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_pd_y
+; CHECK_FMA4: vfmaddpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fadd <4 x double> %x, %a2
@@ -84,8 +110,11 @@ define <4 x double> @test_x86_fmadd_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd_y
-; CHECK: vfmsub213pd %ymm2, %ymm0, %ymm1
+; CHECK: vfmsub213pd %ymm2, %ymm1, %ymm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd_y
+; CHECK_FMA4: vfmsubpd %ymm2, %ymm1, %ymm0, %ymm0
+; CHECK_FMA4: ret
define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4 x double> %a2) {
%x = fmul <4 x double> %a0, %a1
%res = fsub <4 x double> %x, %a2
@@ -93,8 +122,11 @@ define <4 x double> @test_x86_fmsub_pd_y(<4 x double> %a0, <4 x double> %a1, <4
}
; CHECK: test_x86_fmsub_pd
-; CHECK: vfmsub213pd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213pd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_pd
+; CHECK_FMA4: vfmsubpd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x double> %a2) {
%x = fmul <2 x double> %a0, %a1
%res = fsub <2 x double> %x, %a2
@@ -102,8 +134,11 @@ define <2 x double> @test_x86_fmsub_pd(<2 x double> %a0, <2 x double> %a1, <2 x
}
; CHECK: test_x86_fnmadd_ss
-; CHECK: vfnmadd213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_ss
+; CHECK_FMA4: vfnmaddss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
%x = fmul float %a0, %a1
%res = fsub float %a2, %x
@@ -111,8 +146,11 @@ define float @test_x86_fnmadd_ss(float %a0, float %a1, float %a2) {
}
; CHECK: test_x86_fnmadd_sd
-; CHECK: vfnmadd213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfnmadd213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmadd_sd
+; CHECK_FMA4: vfnmaddsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %a2, %x
@@ -120,8 +158,11 @@ define double @test_x86_fnmadd_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fmsub_sd
-; CHECK: vfmsub213sd %xmm2, %xmm0, %xmm1
+; CHECK: vfmsub213sd %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_sd
+; CHECK_FMA4: vfmsubsd %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
%x = fmul double %a0, %a1
%res = fsub double %x, %a2
@@ -129,11 +170,43 @@ define double @test_x86_fmsub_sd(double %a0, double %a1, double %a2) {
}
; CHECK: test_x86_fnmsub_ss
-; CHECK: vfnmsub213ss %xmm2, %xmm0, %xmm1
+; CHECK: vfnmsub213ss %xmm2, %xmm1, %xmm0
; CHECK: ret
+; CHECK_FMA4: test_x86_fnmsub_ss
+; CHECK_FMA4: vfnmsubss %xmm2, %xmm1, %xmm0, %xmm0
+; CHECK_FMA4: ret
define float @test_x86_fnmsub_ss(float %a0, float %a1, float %a2) {
%x = fsub float -0.000000e+00, %a0
%y = fmul float %x, %a1
%res = fsub float %y, %a2
ret float %res
}
+
+; CHECK: test_x86_fmadd_ps_load
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: vfmadd213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmadd_ps_load
+; CHECK_FMA4: vfmaddps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmadd_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fadd <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
+; CHECK: test_x86_fmsub_ps_load
+; CHECK: vmovaps (%rdi), %xmm2
+; CHECK: fmsub213ps %xmm1, %xmm0, %xmm2
+; CHECK: ret
+; CHECK_FMA4: test_x86_fmsub_ps_load
+; CHECK_FMA4: vfmsubps %xmm1, (%rdi), %xmm0, %xmm0
+; CHECK_FMA4: ret
+define <4 x float> @test_x86_fmsub_ps_load(<4 x float>* %a0, <4 x float> %a1, <4 x float> %a2) {
+ %x = load <4 x float>* %a0
+ %y = fmul <4 x float> %x, %a1
+ %res = fsub <4 x float> %y, %a2
+ ret <4 x float> %res
+}
+
diff --git a/test/CodeGen/X86/fp-fast.ll b/test/CodeGen/X86/fp-fast.ll
new file mode 100644
index 0000000..091f0de
--- /dev/null
+++ b/test/CodeGen/X86/fp-fast.ll
@@ -0,0 +1,37 @@
+; RUN: llc -march=x86-64 -mattr=-fma4 -mtriple=x86_64-apple-darwin -enable-unsafe-fp-math < %s | FileCheck %s
+
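+; With unsafe FP math these sums of %a should reassociate into a single
+; multiply by a constant, and test3 should fold away to zero entirely.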
+; CHECK: test1
+define float @test1(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fadd float %a, %a
+ %r = fadd float %t1, %t1
+ ret float %r
+}
+
+; CHECK: test2
+define float @test2(float %a) {
+; CHECK-NOT: addss
+; CHECK: mulss
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 4.0, %a
+ %t2 = fadd float %a, %a
+ %r = fadd float %t1, %t2
+ ret float %r
+}
+
+; CHECK: test3
+define float @test3(float %a) {
+; CHECK-NOT: addss
+; CHECK: xorps
+; CHECK-NOT: addss
+; CHECK: ret
+ %t1 = fmul float 2.0, %a
+ %t2 = fadd float %a, %a
+ %r = fsub float %t1, %t2
+ ret float %r
+}
+
diff --git a/test/CodeGen/X86/inline-asm-tied.ll b/test/CodeGen/X86/inline-asm-tied.ll
index 91576fb..597236e 100644
--- a/test/CodeGen/X86/inline-asm-tied.ll
+++ b/test/CodeGen/X86/inline-asm-tied.ll
@@ -19,3 +19,12 @@ entry:
%1 = load i64* %retval ; <i64> [#uses=1]
ret i64 %1
}
+
+; The tied operands are not necessarily in the same order as the defs.
+; PR13742
+define i64 @swapped(i64 %x, i64 %y) nounwind {
+entry:
+ %x0 = call { i64, i64 } asm "foo", "=r,=r,1,0,~{dirflag},~{fpsr},~{flags}"(i64 %x, i64 %y) nounwind
+ %x1 = extractvalue { i64, i64 } %x0, 0
+ ret i64 %x1
+}
diff --git a/test/CodeGen/X86/ms-inline-asm.ll b/test/CodeGen/X86/ms-inline-asm.ll
new file mode 100644
index 0000000..014132b
--- /dev/null
+++ b/test/CodeGen/X86/ms-inline-asm.ll
@@ -0,0 +1,14 @@
+; RUN: llc < %s -march=x86 | FileCheck %s
+
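+; MS-style (Intel dialect) inline asm must be emitted between
+; .intel_syntax and .att_syntax directives.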
+define i32 @t1() nounwind {
+entry:
+ %0 = tail call i32 asm sideeffect inteldialect "mov eax, $1\0Amov $0, eax", "=r,r,~{eax},~{dirflag},~{fpsr},~{flags}"(i32 1) nounwind
+ ret i32 %0
+; CHECK: t1
+; CHECK: {{## InlineAsm Start|#APP}}
+; CHECK: .intel_syntax
+; CHECK: mov eax, ecx
+; CHECK: mov ecx, eax
+; CHECK: .att_syntax
+; CHECK: {{## InlineAsm End|#NO_APP}}
+}
diff --git a/test/CodeGen/X86/phys_subreg_coalesce-3.ll b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
index 984d7e5..51320dd 100644
--- a/test/CodeGen/X86/phys_subreg_coalesce-3.ll
+++ b/test/CodeGen/X86/phys_subreg_coalesce-3.ll
@@ -1,14 +1,10 @@
; RUN: llc < %s -mtriple=i386-apple-darwin | FileCheck %s
-; XFAIL: *
; rdar://5571034
; This requires physreg joining, %vreg13 is live everywhere:
; 304L %CL<def> = COPY %vreg13:sub_8bit; GR32_ABCD:%vreg13
; 320L %vreg15<def> = COPY %vreg19; GR32:%vreg15 GR32_NOSP:%vreg19
; 336L %vreg15<def> = SAR32rCL %vreg15, %EFLAGS<imp-def,dead>, %CL<imp-use,kill>; GR32:%vreg15
-;
-; This test is XFAIL until the register allocator understands trivial physreg
-; interference. <rdar://9802098>
define void @foo(i32* nocapture %quadrant, i32* nocapture %ptr, i32 %bbSize, i32 %bbStart, i32 %shifts) nounwind ssp {
; CHECK: foo:
diff --git a/test/CodeGen/X86/pr12312.ll b/test/CodeGen/X86/pr12312.ll
new file mode 100644
index 0000000..84102f1
--- /dev/null
+++ b/test/CodeGen/X86/pr12312.ll
@@ -0,0 +1,48 @@
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+sse41,-avx < %s | FileCheck %s --check-prefix SSE41
+; RUN: llc -mtriple=x86_64-unknown-unknown -mattr=+avx < %s | FileCheck %s --check-prefix AVX
+
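+; An i128 compare against zero of a bitcast vector should lower to a
+; (v)ptest of the vector with itself instead of scalar compares.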
+define i32 @veccond(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ br i1 %1, label %if-true-block, label %endif-block
+
+if-true-block: ; preds = %entry
+ ret i32 0
+endif-block: ; preds = %entry
+ ret i32 1
+; SSE41: veccond
+; SSE41: ptest
+; SSE41: ret
+; AVX: veccond
+; AVX: vptest
+; AVX: ret
+}
+
+define i32 @vectest(<4 x i32> %input) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = zext i1 %1 to i32
+ ret i32 %2
+; SSE41: vectest
+; SSE41: ptest
+; SSE41: ret
+; AVX: vectest
+; AVX: vptest
+; AVX: ret
+}
+
+define i32 @vecsel(<4 x i32> %input, i32 %a, i32 %b) {
+entry:
+ %0 = bitcast <4 x i32> %input to i128
+ %1 = icmp ne i128 %0, 0
+ %2 = select i1 %1, i32 %a, i32 %b
+ ret i32 %2
+; SSE41: vecsel
+; SSE41: ptest
+; SSE41: ret
+; AVX: vecsel
+; AVX: vptest
+; AVX: ret
+}
diff --git a/test/CodeGen/X86/pr12359.ll b/test/CodeGen/X86/pr12359.ll
new file mode 100644
index 0000000..024b163
--- /dev/null
+++ b/test/CodeGen/X86/pr12359.ll
@@ -0,0 +1,10 @@
+; RUN: llc -asm-verbose -mtriple=x86_64-unknown-unknown -mcpu=corei7 < %s | FileCheck %s
+define <16 x i8> @shuf(<16 x i8> %inval1) {
+entry:
+ %0 = shufflevector <16 x i8> %inval1, <16 x i8> zeroinitializer, <16 x i32> <i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4, i32 0, i32 4, i32 3, i32 2, i32 16, i32 16, i32 3, i32 4>
+ ret <16 x i8> %0
+; CHECK: shuf
+; CHECK: # BB#0: # %entry
+; CHECK-NEXT: pshufb
+; CHECK-NEXT: ret
+}
diff --git a/test/CodeGen/X86/tls-pic.ll b/test/CodeGen/X86/tls-pic.ll
index 51c3d23..b823f0a 100644
--- a/test/CodeGen/X86/tls-pic.ll
+++ b/test/CodeGen/X86/tls-pic.ll
@@ -76,12 +76,12 @@ entry:
; X32: f5:
; X32: leal {{[jk]}}@TLSLDM(%ebx)
-; X32-NEXT: calll ___tls_get_addr@PLT
-; X32-NEXT: movl {{[jk]}}@DTPOFF(%eax)
-; X32-NEXT: addl {{[jk]}}@DTPOFF(%eax)
+; X32: calll ___tls_get_addr@PLT
+; X32: movl {{[jk]}}@DTPOFF(%e
+; X32: addl {{[jk]}}@DTPOFF(%e
; X64: f5:
; X64: leaq {{[jk]}}@TLSLD(%rip), %rdi
-; X64-NEXT: callq __tls_get_addr@PLT
-; X64-NEXT: movl {{[jk]}}@DTPOFF(%rax)
-; X64-NEXT: addl {{[jk]}}@DTPOFF(%rax)
+; X64: callq __tls_get_addr@PLT
+; X64: movl {{[jk]}}@DTPOFF(%r
+; X64: addl {{[jk]}}@DTPOFF(%r
diff --git a/test/CodeGen/X86/vec_fabs.ll b/test/CodeGen/X86/vec_fabs.ll
new file mode 100644
index 0000000..82517cb
--- /dev/null
+++ b/test/CodeGen/X86/vec_fabs.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
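+; fabs on vectors should lower to a vandps with a constant mask that
+; clears the sign bits rather than a libcall.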
+define <2 x double> @fabs_v2f64(<2 x double> %p)
+{
+ ; CHECK: fabs_v2f64
+ ; CHECK: vandps
+ %t = call <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.fabs.v2f64(<2 x double> %p)
+
+define <4 x float> @fabs_v4f32(<4 x float> %p)
+{
+ ; CHECK: fabs_v4f32
+ ; CHECK: vandps
+ %t = call <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.fabs.v4f32(<4 x float> %p)
+
+define <4 x double> @fabs_v4f64(<4 x double> %p)
+{
+ ; CHECK: fabs_v4f64
+ ; CHECK: vandps
+ %t = call <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.fabs.v4f64(<4 x double> %p)
+
+define <8 x float> @fabs_v8f32(<8 x float> %p)
+{
+ ; CHECK: fabs_v8f32
+ ; CHECK: vandps
+ %t = call <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.fabs.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_floor.ll b/test/CodeGen/X86/vec_floor.ll
new file mode 100644
index 0000000..5e0160b
--- /dev/null
+++ b/test/CodeGen/X86/vec_floor.ll
@@ -0,0 +1,38 @@
+; RUN: llc < %s -mtriple=x86_64-apple-darwin -march=x86 -mcpu=corei7-avx | FileCheck %s
+
+
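+; floor on vectors should lower to vroundps/vroundpd with the round-down
+; immediate rather than scalar libcalls.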
+define <2 x double> @floor_v2f64(<2 x double> %p)
+{
+ ; CHECK: floor_v2f64
+ ; CHECK: vroundpd
+ %t = call <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+ ret <2 x double> %t
+}
+declare <2 x double> @llvm.floor.v2f64(<2 x double> %p)
+
+define <4 x float> @floor_v4f32(<4 x float> %p)
+{
+ ; CHECK: floor_v4f32
+ ; CHECK: vroundps
+ %t = call <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+ ret <4 x float> %t
+}
+declare <4 x float> @llvm.floor.v4f32(<4 x float> %p)
+
+define <4 x double> @floor_v4f64(<4 x double> %p)
+{
+ ; CHECK: floor_v4f64
+ ; CHECK: vroundpd
+ %t = call <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+ ret <4 x double> %t
+}
+declare <4 x double> @llvm.floor.v4f64(<4 x double> %p)
+
+define <8 x float> @floor_v8f32(<8 x float> %p)
+{
+ ; CHECK: floor_v8f32
+ ; CHECK: vroundps
+ %t = call <8 x float> @llvm.floor.v8f32(<8 x float> %p)
+ ret <8 x float> %t
+}
+declare <8 x float> @llvm.floor.v8f32(<8 x float> %p)
diff --git a/test/CodeGen/X86/vec_fpext.ll b/test/CodeGen/X86/vec_fpext.ll
index 05b263e..dc0464f 100644
--- a/test/CodeGen/X86/vec_fpext.ll
+++ b/test/CodeGen/X86/vec_fpext.ll
@@ -1,14 +1,38 @@
; RUN: llc < %s -march=x86 -mattr=+sse41,-avx | FileCheck %s
+; RUN: llc < %s -march=x86 -mattr=+avx | FileCheck --check-prefix=AVX %s
; PR11674
define void @fpext_frommem(<2 x float>* %in, <2 x double>* %out) {
entry:
-; TODO: We should be able to generate cvtps2pd for the load.
-; For now, just check that we generate something sane.
-; CHECK: cvtss2sd
-; CHECK: cvtss2sd
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
%0 = load <2 x float>* %in, align 8
%1 = fpext <2 x float> %0 to <2 x double>
store <2 x double> %1, <2 x double>* %out, align 1
ret void
}
+
+define void @fpext_frommem4(<4 x float>* %in, <4 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <4 x float>* %in
+ %1 = fpext <4 x float> %0 to <4 x double>
+ store <4 x double> %1, <4 x double>* %out, align 1
+ ret void
+}
+
+define void @fpext_frommem8(<8 x float>* %in, <8 x double>* %out) {
+entry:
+; CHECK: cvtps2pd (%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 8(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 16(%{{.+}}), %xmm{{[0-9]+}}
+; CHECK: cvtps2pd 24(%{{.+}}), %xmm{{[0-9]+}}
+; AVX: vcvtps2pd (%{{.+}}), %ymm{{[0-9]+}}
+; AVX: vcvtps2pd 16(%{{.+}}), %ymm{{[0-9]+}}
+ %0 = load <8 x float>* %in
+ %1 = fpext <8 x float> %0 to <8 x double>
+ store <8 x double> %1, <8 x double>* %out, align 1
+ ret void
+}
diff --git a/test/CodeGen/X86/vec_shuffle-26.ll b/test/CodeGen/X86/vec_shuffle-26.ll
index 086af6b..8dfc2ea 100644
--- a/test/CodeGen/X86/vec_shuffle-26.ll
+++ b/test/CodeGen/X86/vec_shuffle-26.ll
@@ -1,6 +1,5 @@
-; RUN: llc < %s -march=x86 -mattr=sse41 -o %t
-; RUN: grep unpcklps %t | count 1
-; RUN: grep unpckhps %t | count 3
+; RUN: llc < %s -march=x86 -mcpu=generic -mattr=sse41 | FileCheck %s
+; RUN: llc < %s -march=x86 -mcpu=atom -mattr=+sse41 | FileCheck -check-prefix=ATOM %s
; Transpose example using the more generic vector shuffle. Return float8
; instead of float16
@@ -14,6 +13,17 @@ target triple = "i386-apple-cl.1.0"
define <8 x float> @__transpose2(<4 x float> %p0, <4 x float> %p1, <4 x float> %p2, <4 x float> %p3) nounwind {
entry:
+; CHECK: transpose2
+; CHECK: unpckhps
+; CHECK: unpckhps
+; CHECK: unpcklps
+; CHECK: unpckhps
+; Different instruction order for Atom.
+; ATOM: transpose2
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpckhps
+; ATOM: unpcklps
%unpcklps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
%unpckhps = shufflevector <4 x float> %p0, <4 x float> %p2, <4 x i32> < i32 2, i32 6, i32 3, i32 7 > ; <<4 x float>> [#uses=2]
%unpcklps8 = shufflevector <4 x float> %p1, <4 x float> %p3, <4 x i32> < i32 0, i32 4, i32 1, i32 5 > ; <<4 x float>> [#uses=2]
@@ -27,3 +37,32 @@ entry:
; %r3 = shufflevector <8 x float> %r1, <8 x float> %r2, <16 x i32> < i32 0, i32 1, i32 2, i32 3, i32 4, i32 5, i32 6, i32 7, i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15 >;
ret <8 x float> %r2
}
+
+define <2 x i64> @lo_hi_shift(float* nocapture %x, float* nocapture %y) nounwind {
+entry:
+; movhps should happen before extractps to assure it gets the correct value.
+; CHECK: lo_hi_shift
+; CHECK: movhps ([[BASEREG:%[a-z]+]]),
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; CHECK: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: lo_hi_shift
+; ATOM: movhps ([[BASEREG:%[a-z]+]]),
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+; ATOM: extractps ${{[0-9]+}}, %xmm{{[0-9]+}}, {{[0-9]*}}([[BASEREG]])
+ %v.i = bitcast float* %y to <4 x float>*
+ %0 = load <4 x float>* %v.i, align 1
+ %1 = bitcast float* %x to <1 x i64>*
+ %.val = load <1 x i64>* %1, align 1
+ %2 = bitcast <1 x i64> %.val to <2 x float>
+ %shuffle.i = shufflevector <2 x float> %2, <2 x float> undef, <4 x i32> <i32 0, i32 1, i32 undef, i32 undef>
+ %shuffle1.i = shufflevector <4 x float> %0, <4 x float> %shuffle.i, <4 x i32> <i32 0, i32 1, i32 4, i32 5>
+ %cast.i = bitcast <4 x float> %0 to <2 x i64>
+ %extract.i = extractelement <2 x i64> %cast.i, i32 1
+ %3 = bitcast float* %x to i64*
+ store i64 %extract.i, i64* %3, align 4
+ %4 = bitcast <4 x float> %0 to <16 x i8>
+ %5 = bitcast <4 x float> %shuffle1.i to <16 x i8>
+ %palignr = shufflevector <16 x i8> %5, <16 x i8> %4, <16 x i32> <i32 8, i32 9, i32 10, i32 11, i32 12, i32 13, i32 14, i32 15, i32 16, i32 17, i32 18, i32 19, i32 20, i32 21, i32 22, i32 23>
+ %6 = bitcast <16 x i8> %palignr to <2 x i64>
+ ret <2 x i64> %6
+}
diff --git a/test/CodeGen/X86/widen_load-1.ll b/test/CodeGen/X86/widen_load-1.ll
index 9705d14..dfaa3d6 100644
--- a/test/CodeGen/X86/widen_load-1.ll
+++ b/test/CodeGen/X86/widen_load-1.ll
@@ -1,12 +1,17 @@
-; RUN: llc %s -o - -march=x86-64 -mtriple=x86_64-unknown-linux-gnu | FileCheck %s
+; RUN: llc %s -o - -march=x86-64 -mattr=-avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=SSE
+; RUN: llc %s -o - -march=x86-64 -mattr=+avx -mtriple=x86_64-unknown-linux-gnu | FileCheck %s --check-prefix=AVX
; PR4891
; PR5626
; This load should be before the call, not after.
-; CHECK: movaps compl+128(%rip), %xmm0
-; CHECK: movaps %xmm0, (%rsp)
-; CHECK: callq killcommon
+; SSE: movaps compl+128(%rip), %xmm0
+; SSE: movaps %xmm0, (%rsp)
+; SSE: callq killcommon
+
+; AVX: vmovapd compl+128(%rip), %xmm0
+; AVX: vmovapd %xmm0, (%rsp)
+; AVX: callq killcommon
@compl = linkonce global [20 x i64] zeroinitializer, align 64 ; <[20 x i64]*> [#uses=1]